Example #1
class KafkaHandler(logging.Handler):
    def __init__(self, host, *args, **kwargs):
        super(KafkaHandler, self).__init__(*args, **kwargs)
        self.kafka_producer = KafkaProducer(bootstrap_servers=host)

    def emit(self, record):
        message = self.format(record)
        event_dict = {
            'klog_level': record.levelname.upper(),
            'klog_time': record.created,
            'klog_message': message,
        }

        for attribute, value in six.iteritems(vars(record)):
            event_dict[attribute] = value

        json_dump = json.dumps(event_dict)

        # topic names stay plain strings (a per-logger topic plus a catch-all topic)
        self.kafka_producer.send(str(record.name) + '.json',
                                 json_dump.encode('utf-8'))
        self.kafka_producer.send('all.json', json_dump.encode('utf-8'))

        self.kafka_producer.send(str(record.name) + '.txt',
                                 message.encode('utf-8'))
        self.kafka_producer.send('all.txt', message.encode('utf-8'))

        self.flush()

    def flush(self):
        self.kafka_producer.flush()
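For context, a minimal usage sketch for the handler above; it assumes a broker reachable at localhost:9092 (a placeholder) and the same imports the class relies on (logging, json, six, kafka.KafkaProducer).

import logging

logger = logging.getLogger("orders")
logger.setLevel(logging.INFO)
logger.addHandler(KafkaHandler("localhost:9092"))   # placeholder broker address

# every LogRecord attribute, including the extra dict, ends up in the JSON payload
logger.info("order received", extra={"order_id": 42})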
Example #2
def main():

    ## the topic
    topic = sys.argv[1]

    ## create a Kafka producer with json serializer
    producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                             bootstrap_servers=server)
    print("*** Starting measurements stream on " + server + ", topic : " + topic)

    try:
        while True:
            for userId in range(1, 200):
                ## Generate random measurements
                meas1 = {"userid": "%d" % userId, "type": "HR", "value": getHeartRate()}
                producer.send(topic, meas1, key=b'%d' % userId)

                meas2 = {"userid": "%d" % userId, "type": "TEMP", "value": getTemperature()}
                producer.send(topic, meas2, key=b'%d' % userId)

                meas3 = {"userid": "%d" % userId, "type": "SBP", "value": getSystolicBloodPressure()}
                producer.send(topic, meas3, key=b'%d' % userId)

                print("Sending HR   : %s" % json.dumps(meas1).encode('utf-8'))
                print("Sending TEMP : %s" % json.dumps(meas2).encode('utf-8'))
                print("Sending BP   : %s" % json.dumps(meas3).encode('utf-8'))

            sleep(1)

    except KeyboardInterrupt:
        pass

    print("\nIntercepted user interruption ..\nBlock until all pending messages are sent..")
    producer.flush()
Example #3
def sendSingleMsg2Kafka(msg):
    if not msg:
        return
    producer = KafkaProducer(bootstrap_servers='10.128.184.167:9092')
    producer.send('topic_lpr', msg.encode('utf8'))
    producer.flush()
    producer.close(timeout=5)
Example #4
def kafka_producer_call():
    kafka_producer = KafkaProducer(bootstrap_servers=KAFKA_SERVER)
    for i in range(NB_MESSAGES):
        word = b"yay"  # value must be bytes since no value_serializer is configured
        kafka_producer.send(KAFKA_TOPIC, word)
    kafka_producer.flush()
    return 1
Example #5
class KafkaMessageSender(object):
	
	def __init__(self,config_source):

		self.config_source = config_source
		# config_source = "config/producer_config.yml"

		# load configuration parameters
		config = yaml_loader(self.config_source)

		# initialize parameters
		self.topics = config['topics']
		self.port = config['port']

		self.current_topic = self.topics[0]

		self.producer = KafkaProducer(bootstrap_servers=[self.port])

	def send_message(self,messages):
		for message in messages:
			# self.producer.send(self.current_topic, value = message.strip('[]').splitlines()[0] )
			print(message.strip('[]'))
			self.producer.send(self.current_topic, value = message.strip('[]').encode('utf-8') )

			# block until all async messages are sent
			self.producer.flush()
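A brief usage sketch for the class above. The YAML layout is an assumption inferred from the keys the constructor reads ('topics' and 'port'), and yaml_loader is assumed to return that file parsed into a dict.

# config/producer_config.yml (assumed layout):
#   topics:
#     - measurements
#   port: "localhost:9092"
sender = KafkaMessageSender("config/producer_config.yml")
sender.send_message(["[first message]", "[second message]"])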
Example #6
def test_end_to_end(kafka_broker):
    connect_str = 'localhost:' + str(kafka_broker.port)
    producer = KafkaProducer(bootstrap_servers=connect_str,
                             max_block_ms=10000,
                             value_serializer=str.encode)
    consumer = KafkaConsumer(bootstrap_servers=connect_str,
                             group_id=None,
                             consumer_timeout_ms=10000,
                             auto_offset_reset='earliest',
                             value_deserializer=bytes.decode)

    topic = random_string(5)

    for i in range(1000):
        producer.send(topic, 'msg %d' % i)
    producer.flush()
    producer.close()

    consumer.subscribe([topic])
    msgs = set()
    for i in range(1000):
        try:
            msgs.add(next(consumer).value)
        except StopIteration:
            break

    assert msgs == set(['msg %d' % i for i in range(1000)])
Example #7
    def run(self, run_time):
        """
        Send checkresults to Kafka Topic
        """
        logging.debug("Establishing passive handler: Kafka")
        super(Handler, self).run()
        itemlist = []
        for check in self.checks:
            if check.needs_to_run():
                item = self.do_check(check)
                item.check_time = run_time
                check.set_next_run(run_time)
                item.hostname = self.get_kafka_hostname(item)
                itemlist.append(item)

        if len(itemlist) > 0:
            try:
                logging.info('Connect to Kafka Server')
                producer = KafkaProducer(bootstrap_servers=['{}'.format(self.str_kafakhosts)], client_id=self.str_client_id)
            except KafkaError:
                logging.warning(
                    'Problem to connect Kafka Server: {} with Topic: {} and Clientname {} '.format(self.str_kafakhosts,
                                                                                                   self.str_topic,
                                                                                                   self.str_client_id))
                return
            for item in itemlist:
                producer.send(self.str_topic,
                              key=str(item.hostname).encode('utf-8'),
                              value=json.dumps(self.format_for_kafka(self, item)).encode('utf-8'))

            producer.flush()
Example #8
def get_reddit_submissions(subreddit):
    # Connect to Kafka
    producer = KafkaProducer(bootstrap_servers='kafka:9092')
    # Reddit API
    reddit = authenticate_api()
    
    submissions = 0
    try:
        for submission in reddit.subreddit(subreddit).new():
            sub = format_submission(submission)
            if submissions > 1000:
                break
     
            msg = producer.send('data', json.dumps(sub).encode('utf-8'))
            submissions += 1
            print(submissions)
            with open('test.jsonl', 'a') as f:
                f.write(json.dumps(sub)+'\n') 

        # Flush kafka producer
        producer.flush()
    except Exception as e:
        with open('Errors.txt', 'a') as f:
            f.write(str(type(e))+'\n')
            f.write(str(e)+'\n') 

    # Flush kafka producer                                                  
    producer.flush()
    return subreddit
Example #9
def get_user_tweets(username):
    # Connect to Kafka
    producer = KafkaProducer(bootstrap_servers='kafka:9092')
    # Twitter API
    api = authenticate_api()

    tweets = 0
    need_update = True
    try:
        for page in Cursor(api.user_timeline, screen_name=username, count=200).pages(16):
            for status in page:
                status = status._json
                msg = producer.send('data', json.dumps(format_tweet(status)).encode('utf-8'))
                tweets += 1
                print(tweets)

                with open('test1.jsonl', 'a') as f:
                    f.write(json.dumps(format_tweet(status))+'\n')
            
            # Flush kafka producer
            producer.flush()      
            # Follow Twitter's Rate limit 
            sleep(2)
    except Exception as e:
        print(e)
        pass

    # Flush kafka producer                                              
    producer.flush()
    return username
Example #10
def main():
    """
    A generic Kafka producer for use as a Cylc event handler.

    USAGE:
       cylc_kafka_producer.py <HOST:PORT> <TOPIC> key1=val1 key2=val2 ...
    serializes {key1: val1, key2: val2, ...} to TOPIC at Kafka on HOST:PORT.

    This is generic in that a JSON message schema is defined by the received
    command line keyword arguments. To enforce compliance to a particular
    schema, copy and modify as needed.

    Can be partnered with the generic cylc_kafka_consumer external trigger
    function, for triggering downstream suites.

    """

    if 'help' in sys.argv[1]:
        print(cleandoc(main.__doc__))
        sys.exit(0)

    # TODO exception handling for bad inputs etc.
    kafka_server = sys.argv[1]
    kafka_topic = sys.argv[2]
    # Construct a message dict from kwargs.
    dmsg = dict([k.split('=') for k in sys.argv[3:]])

    producer = KafkaProducer(
        bootstrap_servers=kafka_server,
        value_serializer=lambda msg: json.dumps(msg).encode('utf-8'))

    producer.send(kafka_topic, dmsg)
    producer.flush()
Example #11
class SimpleProducer(BaseStreamProducer):
    def __init__(self, location, enable_ssl, cert_path, topic, compression, **kwargs):
        self._location = location
        self._topic = topic
        self._compression = compression
        self._create(enable_ssl, cert_path, **kwargs)

    def _create(self, enable_ssl, cert_path, **kwargs):
        max_request_size = kwargs.pop('max_request_size', DEFAULT_MAX_REQUEST_SIZE)
        kwargs.update(_prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {})
        self._producer = KafkaProducer(bootstrap_servers=self._location,
                                       retries=5,
                                       compression_type=self._compression,
                                       max_request_size=max_request_size,
                                       **kwargs)

    def send(self, key, *messages):
        for msg in messages:
            self._producer.send(self._topic, value=msg)

    def flush(self):
        self._producer.flush()

    def close(self):
        self._producer.close()
Example #12
class KafkaPythonClient(PythonClient):
    def __init__(self,topic=topic_name, kafkaHost = kafka_host, zookeeperHost=zookeeper_host):
        self.config["topic"] = topic
        self.config["kafkaHost"] = kafkaHost
        self.config["zookeeperHost"] = zookeeperHost
        super(KafkaPythonClient, self).__init__()

    def createProducer(self, kafkaSync):
        self.config["kafkaSync"] = kafkaSync
        self.producer = KafkaProducer(bootstrap_servers=self.config["kafkaHost"])

    def createConsumer(self):
        self.consumer = KafkaConsumer(bootstrap_servers=self.config["kafkaHost"], enable_auto_commit=True, auto_offset_reset='latest',consumer_timeout_ms=1000)
        self.consumer.subscribe([self.config["topic"]])

    def produce(self, num_msg=20000):
        self.msgCount = num_msg
        for x in range (self.msgCount):
            self.prtProgress(x, 10000)
            result = self.producer.send(self.config["topic"], self.msg)
            if self.config["kafkaSync"] == True:
                # block for "synchronous" mode:
                try:
                    result_metadata = result.get(timeout=10)
                except KafkaError:
                    print("*** KAFKA ERROR ***")
                    pass
        if (x >= 10000):
            sys.stdout.write('\n')

    def consume(self, num_msg):
        count = 0
        for message in self.consumer:
            count += 1
            self.prtProgress(count, 10000)
        sys.stdout.write('\n')
        if num_msg >  0:
            if count != num_msg:
                print("ERROR: KafkaPythonClient.consume: # of messages not as expected, read: {}, expected: {}".format(count, num_msg))
        return count

    def startProducer(self): pass

    def stopProducer(self):
        self.beforeFlushTimer(self.timeDict['producer'])
        if self.config["kafkaSync"] == False:
            self.producer.flush()

    def stopConsumer(self): pass

    def initCount(self):
        self.consume(0)
#       for p in self.consumer.partitions_for_topic(self.config['topic']):
#           tp = TopicPartition(self.config['topic'], p)
#           self.consumer.assign([tp])
#           committed = self.consumer.committed(tp)
#           consumer.seek_to_end(tp)

    def finalize(self): pass
Example #13
    def run(self):
        producer = KafkaProducer(bootstrap_servers='localhost:9092')
        self.sent = 0

        while not producer_stop.is_set():
            producer.send('my-topic', self.big_msg)
            self.sent += 1
        producer.flush()
Example #14
    def send_message(self,message,topic_partition):

        self._logger.info("Sending message to: Topic: {0} Partition:{1}".format(self._topic,topic_partition))
        kafka_brokers = '{0}:{1}'.format(self._server,self._port)             
        producer = KafkaProducer(bootstrap_servers=[kafka_brokers],api_version_auto_timeout_ms=3600000)
        future = producer.send(self._topic,message,partition=topic_partition)
        producer.flush()
        producer.close()
Example #15
def sendToKafka(jsonList):
    producer = KafkaProducer(bootstrap_servers=['localhost:9092'], value_serializer=lambda m: json.dumps(m).encode('utf-8'))
    #producer = KafkaProducer(bootstrap_servers='localhost:9092')
#    producer = KafkaProducer(value_serializer=lambda v: json.dumps(v).encode('utf-8'))

    #producer = KafkaProducer(key_serializer=str.encode)
    #producer.send(TOPIC, key='foo123123123', value=b'bar111111111')
    for j in jsonList:
        producer.send(TOPIC,j)
    producer.flush()
Example #16
def kafka_produce_protobuf_messages(topic, start_index, num_messages):
    data = b''
    for i in range(start_index, start_index + num_messages):
        msg = kafka_pb2.KeyValuePair()
        msg.key = i
        msg.value = str(i)
        serialized_msg = msg.SerializeToString()
        data = data + _VarintBytes(len(serialized_msg)) + serialized_msg
    producer = KafkaProducer(bootstrap_servers="localhost:9092")
    producer.send(topic=topic, value=data)
    producer.flush()
    print("Produced {} messages for topic {}".format(num_messages, topic))
Example #17
    def _push(self, payload):
        if super(KafkaService, self)._push(payload):
            LOGGER.info("Pushing payload to kafka: %s", str(payload))
            brokers = self.destination_config['brokers'].split(',')
            topic = self.destination_config['topic']
            kafka_producer = KafkaProducer(bootstrap_servers=brokers)
            for values in payload:
                kafka_producer.send(topic, str(values).encode('utf-8'))
            kafka_producer.flush(3)
            kafka_producer.close(3)
        else:
            LOGGER.warn("Payload is none, nothing to push.")
Example #18
class KafkaSender(LogSender):
    def __init__(self, config, msg_buffer, stats):
        super().__init__(config=config, msg_buffer=msg_buffer, stats=stats,
                         max_send_interval=config.get("max_send_interval", 0.3))
        self.config = config
        self.msg_buffer = msg_buffer
        self.stats = stats
        self.kafka_producer = None
        self.topic = self.config.get("kafka_topic")

    def _init_kafka(self):
        self.log.info("Initializing Kafka client, address: %r", self.config["kafka_address"])
        while self.running:
            try:
                if self.kafka_producer:
                    self.kafka_producer.close()

                self.kafka_producer = KafkaProducer(
                    api_version=self.config.get("kafka_api_version", "0.9"),
                    bootstrap_servers=self.config.get("kafka_address"),
                    compression_type="snappy" if snappy else "gzip",
                    security_protocol="SSL" if self.config.get("ssl") is True else "PLAINTEXT",
                    ssl_cafile=self.config.get("ca"),
                    ssl_certfile=self.config.get("certfile"),
                    ssl_keyfile=self.config.get("keyfile"),
                )
                self.log.info("Initialized Kafka Client, address: %r", self.config["kafka_address"])
                break
            except KAFKA_CONN_ERRORS as ex:
                self.log.warning("Retriable error during Kafka initialization: %s: %s, sleeping",
                                 ex.__class__.__name__, ex)
            self.kafka_producer = None
            time.sleep(5.0)

    def send_messages(self, message_batch):
        if not self.kafka_producer:
            self._init_kafka()
        try:
            for msg in message_batch:
                self.kafka_producer.send(topic=self.topic, value=msg)
            self.kafka_producer.flush()
            return True
        except KAFKA_CONN_ERRORS as ex:
            self.log.info("Kafka retriable error during send: %s: %s, waiting", ex.__class__.__name__, ex)
            time.sleep(0.5)
            self._init_kafka()
        except Exception as ex:  # pylint: disable=broad-except
            self.log.exception("Unexpected exception during send to kafka")
            self.stats.unexpected_exception(ex=ex, where="sender", tags={"app": "journalpump"})
            time.sleep(5.0)
            self._init_kafka()
Example #19
    def run(args):
        try:
            props = {}
            for prop in args.producer_config:
                k, v = prop.split('=')
                try:
                    v = int(v)
                except ValueError:
                    pass
                if v == 'None':
                    v = None
                props[k] = v

            if args.brokers:
                brokers = start_brokers(args.brokers)
                props['bootstrap_servers'] = ['{0}:{1}'.format(broker.host, broker.port)
                                              for broker in brokers]
                print("---> bootstrap_servers={0}".format(props['bootstrap_servers']))
                print()
                print('-> OK!')
                print()

            print('Initializing producer...')
            record = bytes(bytearray(args.record_size))
            props['metrics_sample_window_ms'] = args.stats_interval * 1000

            producer = KafkaProducer(**props)
            for k, v in props.items():
                print('---> {0}={1}'.format(k, v))
            print('---> send {0} byte records'.format(args.record_size))
            print('---> report stats every {0} secs'.format(args.stats_interval))
            print('---> raw metrics? {0}'.format(args.raw_metrics))
            timer_stop = threading.Event()
            timer = StatsReporter(args.stats_interval, producer,
                                  event=timer_stop,
                                  raw_metrics=args.raw_metrics)
            timer.start()
            print('-> OK!')
            print()

            for i in xrange(args.num_records):
                producer.send(topic=args.topic, value=record)
            producer.flush()

            timer_stop.set()

        except Exception:
            exc_info = sys.exc_info()
            traceback.print_exception(*exc_info)
            sys.exit(1)
Example #20
def produce_to_kafka(schema, args, config):
    topic = config['kafka']['topic']
    producer = KafkaProducer(bootstrap_servers = config['kafka']['brokers'])

    def f_produce(topic, partition, key, value):
        producer.send(topic, key = key, value = value, partition = partition)

    partition_count = 1 + max(producer.partitions_for(topic))
    try:
        bootstrap(f_produce, partition_count, schema, args.database, args.table, config)
    except KeyboardInterrupt:
        sys.exit(1)
    producer.flush()
    producer.close()
Example #21
class KafkaProduceServer(object):
    def __init__(self, topic, server):
        if type(server) != list:
            server = [server]
        self._topic=topic
        self._producer = KafkaProducer(bootstrap_servers=server,value_serializer=lambda m: json.dumps(m).encode('ascii'))
    def getProducer(self):
        return self._producer
    def sendMsg(self,msg):
        self._producer.send(self._topic,msg)
        self._producer.flush()
    def sendJson(self,key,json):
        self._producer.send(self._topic,key=key,value=json)
        self._producer.flush()
    def close(self):
        self._producer.close()
Example #22
def push_to_pandas(df):
    import pygeohash
    from cassandra.cluster import Cluster
    from kafka import KafkaProducer
    import timeit

    cluster = Cluster()
    session = cluster.connect('xweather')

    producer = KafkaProducer(bootstrap_servers=['vm1:9092'])
    name = multiprocessing.current_process().name
    #df = pd.read_csv(filename)
    df1 = df[['id','lat','lon','src','elev','timezone','tzoffset']].drop_duplicates()
    df1.src.fillna('NA')
    # Adding Geohash Id
    df1['geohash_id'] = df.apply(lambda row: pygeohash.encode(row['lat'],row['lon']), axis=1)

    # Now loop through the Dataframe
    for row in df1.itertuples():
        j = ','.join((row[8],str(row[1]),str(row[5]),row[8][:3],str(row[2]),str(row[3]),str(row[4]),str(row[6]),str(row[7])))
        future = producer.send('topic-weather-stations', j.encode('utf-8'))

    print('Completed insert into weather stations', name)

    # Now to the facts
    # Remove the descriptive columns
    df.drop(df.columns[[1,2,3,4,5,6]], axis=1, inplace=True)

    # Unpivot the dataset
    df = pd.melt(df, id_vars=['id','timestamp','dateTime'])
    df = df.dropna()
    # Kafka it
    ctr = 0
    producer = KafkaProducer(bootstrap_servers=['vm1:9092'], batch_size=20000, linger_ms=50, buffer_memory=952108864)
    #producer = KafkaProducer(bootstrap_servers=['vm1:9092'])
    start_time = timeit.default_timer()
    for row in df.itertuples():
        k = list(row)
        k = k[1:]
        j = ','.join(str(x) for x in k)
        future = producer.send('topic-weather-data', j.encode('utf-8'))
        ctr += 1
    print('Producer timing is ', name, timeit.default_timer() - start_time, 'Rows:', ctr)
    producer.flush()
    producer.close()
Example #23
def send2Kafka(msgs):
    if not msgs:
        return
    producer = KafkaProducer(bootstrap_servers='10.128.184.167:9092')

    global count
    for msg in msgs:
        tmp = format_msg(msg)
        # print 'Send ==> ', tmp
        producer.send('topic_lpr', tmp.encode('utf8'))
        if count % 100 == 0:
            print(u'==>[{}] {}'.format(count, tmp))
            producer.flush()
        count += 1

    producer.flush()
    producer.close(timeout=5)
Example #24
class KeyedProducer(BaseStreamProducer):
    def __init__(self, location, topic_done, partitioner, compression):
        self._location = location
        self._topic_done = topic_done
        self._partitioner = partitioner
        self._compression = compression
        self._producer = KafkaProducer(bootstrap_servers=self._location, partitioner=partitioner, retries=5,
                                       compression_type=self._compression)

    def send(self, key, *messages):
        for msg in messages:
            self._producer.send(self._topic_done, key=key, value=msg)

    def flush(self):
        self._producer.flush()

    def get_offset(self, partition_id):
        pass
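A hypothetical way to drive this KeyedProducer; the broker, topic and compression values are placeholders, kafka-python's stock DefaultPartitioner stands in for whatever partitioner the surrounding project wires in, and key/value are passed as bytes because the producer is created without serializers.

from kafka.partitioner import DefaultPartitioner

producer = KeyedProducer(location='localhost:9092',      # placeholder broker
                         topic_done='frontier-done',     # placeholder topic
                         partitioner=DefaultPartitioner(),
                         compression='gzip')
producer.send(b'partition-key', b'message 1', b'message 2')
producer.flush()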
Example #25
def try_send():

    producer = KafkaProducer(bootstrap_servers="ip-172-31-12-78.us-west-1.compute.internal:6667")
    # client = KafkaClient("ip-172-31-12-78.us-west-1.compute.internal:6667")
    # producer = SimpleProducer(client, async=True, batch_send_every_n = 100, batch_send_every_t = 60, random_start=False)
    # producer = SimpleProducer(client)
    # connect_str = 'ip-172-31-12-78.us-west-1.compute.internal:6667'
    # producer = KafkaProducer(bootstrap_servers=connect_str,
    #                         max_block_ms=10000,
    #                         value_serializer=str.encode)

    topic = '2008'
    with open('/home/ec2-user/data/2008.csv') as f:
        for line in f:
            producer.send(topic, line.encode('utf-8'))

    producer.flush()
    producer.close()
Example #26
class SimpleProducer(BaseStreamProducer):
    def __init__(self, location, topic, compression):
        self._location = location
        self._topic = topic
        self._compression = compression
        self._create()

    def _create(self):
        self._producer = KafkaProducer(bootstrap_servers=self._location, retries=5,
                                       compression_type=self._compression)

    def send(self, key, *messages):
        for msg in messages:
            self._producer.send(self._topic, value=msg)

    def flush(self):
        self._producer.flush()

    def close(self):
        self._producer.close()
Example #27
class KafkaAccessLayer(object):

    def __init__(self):
        self.connection = None

    def connect(self, uri):
        try:
            def serializer(v):
                return json.dumps(v).encode('utf-8')
            self.connection = KafkaProducer(bootstrap_servers=uri,
                                            value_serializer=serializer)
        except Exception:
            raise Exception('Kafka connection error: {0}'.format(uri))

    def write_stats(self, id, name, stats, **kwargs):
        for stat in stats:
            msg = {'agent_id': id, 'process_name': name,
                   'timestamp': datetime.utcfromtimestamp(stat[0])
                   .strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
                   'cpu': stat[1], 'mem': stat[2]}
            self.connection.send('supervisor', msg)
        self.connection.flush()
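A short usage sketch for KafkaAccessLayer; the broker URI and the stat tuples are placeholders, each stat being read by write_stats() above as (unix_timestamp, cpu, mem).

kafka = KafkaAccessLayer()
kafka.connect('localhost:9092')                      # placeholder broker URI
kafka.write_stats('agent-1', 'worker-process',
                  [(1500000000, 12.5, 104857600)])   # (timestamp, cpu, mem)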
Example #28
class Kafka(Outgoing):
    def __init__(self, logger=None, destination=None, *args, **kwargs):
        if destination == "full_ipv4":
            self.topic = "ipv4"
        elif destination == "alexa_top1mil":
            self.topic = "domain"
        else:
            raise Exception("invalid destination: %s" % destination)
        host = os.environ.get("KAFKA_BOOTSTRAP_HOST", "localhost:9092")
        self.main_producer = KafkaProducer(bootstrap_servers=host)
        self.cert_producer = KafkaProducer(bootstrap_servers=host)

    def take(self, pbout):
        for certificate in pbout.certificates:
            self.cert_producer.send("certificate", certificate)
        self.main_producer.send(self.topic, pbout.transformed)

    def cleanup(self):
        if self.main_producer:
            self.main_producer.flush()
        if self.cert_producer:
            self.cert_producer.flush()
Example #29
def test_end_to_end(kafka_broker, compression):

    if compression == 'lz4':
        # LZ4 requires 0.8.2
        if version() < (0, 8, 2):
            return
        # LZ4 python libs dont work on python2.6
        elif sys.version_info < (2, 7):
            return

    connect_str = 'localhost:' + str(kafka_broker.port)
    producer = KafkaProducer(bootstrap_servers=connect_str,
                             retries=5,
                             max_block_ms=10000,
                             compression_type=compression,
                             value_serializer=str.encode)
    consumer = KafkaConsumer(bootstrap_servers=connect_str,
                             group_id=None,
                             consumer_timeout_ms=10000,
                             auto_offset_reset='earliest',
                             value_deserializer=bytes.decode)

    topic = random_string(5)

    for i in range(1000):
        producer.send(topic, 'msg %d' % i)
    producer.flush(timeout=30)
    producer.close()

    consumer.subscribe([topic])
    msgs = set()
    for i in range(1000):
        try:
            msgs.add(next(consumer).value)
        except StopIteration:
            break

    assert msgs == set(['msg %d' % i for i in range(1000)])
Example #30
class KeyedProducer(BaseStreamProducer):
    def __init__(self, location, enable_ssl, cert_path, topic_done, partitioner, compression, **kwargs):
        self._location = location
        self._topic_done = topic_done
        self._partitioner = partitioner
        self._compression = compression
        max_request_size = kwargs.pop('max_request_size', DEFAULT_MAX_REQUEST_SIZE)
        kwargs.update(_prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {})
        self._producer = KafkaProducer(bootstrap_servers=self._location,
                                       partitioner=partitioner,
                                       retries=5,
                                       compression_type=self._compression,
                                       max_request_size=max_request_size,
                                       **kwargs)

    def send(self, key, *messages):
        for msg in messages:
            self._producer.send(self._topic_done, key=key, value=msg)

    def flush(self):
        self._producer.flush()

    def get_offset(self, partition_id):
        pass
Example #31
class InterceptionSender( threading.Thread ) :
    def __init__( self, userID, interceptionFilePath = "./", interceptionFileName = "interception.pcap" , senderTool : SenderTool = SenderTool.logstash, 
                  kafkaServerAddress = "localhost", kafkaServerPort = 9092, 
                  kafkaTopic = "interception_data", 
                  logstashAddress = "localhost", logstashPort = 5960, logstashMessageVersion = 1, 
                  sizeByteToRead = 100,
                  tcpServerAddress = "", tcpServerPort = 0 ) :
        threading.Thread.__init__( self )
        self.userID = userID
        self.interceptionFilePath = interceptionFilePath
        self.interceptionFileName = interceptionFileName
        self.senderTool = senderTool
        self.kafkaServerAddress = kafkaServerAddress
        self.kafkaServerPort = kafkaServerPort
        self.kafkaTopic = kafkaTopic
        self.logstashAddress = logstashAddress
        self.logstashPort = logstashPort
        self.logstashMessageVersion = logstashMessageVersion
        self.sizeByteToRead = sizeByteToRead
        self.tcpServerAddress = tcpServerAddress
        self.tcpServerPort = tcpServerPort
        self.tcpFirstMessage = None
        self.tcpSocket = None
        localLogger = MyLogger()
        self.logger = localLogger.getLogger( __name__ )
        self.logger.debug( "tcpServerAddress: %s / tcpServerPort: %s",
            str( self.tcpServerAddress ), str( self.tcpServerPort ) )
        self.is_active = True
        if self.senderTool == SenderTool.kafka :
            bootstrap_servers = [ str( self.kafkaServerAddress ) + ":" + str( self.kafkaServerPort ) ]
            self.logger.debug( "kafka bootstrap server: %s", str( bootstrap_servers ) )
            value_serializer = lambda x: dumps(x).encode( 'utf-8' )
            self.producer = KafkaProducer( bootstrap_servers = bootstrap_servers, value_serializer = value_serializer )
            self.__sender = self.__kafkaSender
        elif self.senderTool == SenderTool.logstash :
            self.logger.debug( "Logstash server : %s : %s",
                str( self.logstashAddress ), str( self.logstashPort ) )
            self.sendToLogstash = logging.getLogger( "interception" )
            if len( self.sendToLogstash.handlers ) == 0 :
               self.sendToLogstash.setLevel( logging.INFO )
               self.sendToLogstash.addHandler(
                       logstash.TCPLogstashHandler(
                           host = self.logstashAddress,
                           port = self.logstashPort,
                           version = self.logstashMessageVersion
                       )
                   )
            self.__sender = self.__logstashSender
        elif self.senderTool == SenderTool.tcpstream :
            self.logger.debug( "Tcp stream sender : tcp server : %s:%s", \
                str( self.tcpServerAddress ), str( self.tcpServerPort ) )
            self.tcpFirstMessage = True
            self.__sender = self.__tcpSender
            for res in socket.getaddrinfo( self.tcpServerAddress, 
                    self.tcpServerPort, socket.AF_UNSPEC, socket.SOCK_STREAM ) :
                af, socktype, proto, canonname, sa = res
                try :
                    self.tcpSocket = socket.socket( af, socktype, proto )
                except socket.error as exception :
                    self.logger.debug( "Error: create socket, retry : %s", str( exception ) )
                    self.tcpSocket = None
                    continue
                try :
                    self.tcpSocket.connect( sa )
                except socket.error as exception :
                    self.logger.debug( "Error: connect, retry : %s", str( exception ) )
                    self.tcpSocket.close()
                    self.tcpSocket = None
                    continue
                break
            if self.tcpSocket is None :
                self.logger.error( "ERROR: impossible to connect to %s:%s", \
                    str( self.tcpServerAddress ), str( self.tcpServerPort ) )

        else :
            self.__sender = self.__undefinedSender

    def sender( self, message, bytesToSend ) :
        self.__sender( message, bytesToSend )

    def __logstashSender( self, message, bytesToSend ) :
        self.sendToLogstash.info( message[ "data" ], extra = message )

    def __kafkaSender( self, message, bytesToSend ) :
        # message is a dictionary, lambda function transforms it in a Json payload
        self.producer.send( self.kafkaTopic, message )
        # wait until all messages sent
        self.producer.flush()

    def __tcpSender( self, message, bytesToSend ) :
        if self.tcpSocket :
            if self.tcpFirstMessage :
                # send only name of file to write
                self.tcpFirstMessage = False
                self.tcpSocket.send( message[ "interceptionfilename" ].encode() )
                sleep( 1 )
            # send bytestream
            self.tcpSocket.send( bytesToSend )
        else :
            self.logger.error( "ERROR: socket is empty" )

    def __undefinedSender( self, message, bytesToSend ) :
        self.logger.error( "ERROR : sender tool not defined" )
    
    def run( self ) :
        self.logger.debug( "thread start")
        message = dict()
        message[ "userid" ] = self.userID
        message[ "data" ] = b'' # field for interception data
        message[ "interceptionfilename" ] = self.interceptionFileName
        interceptionCompletePath = str( self.interceptionFilePath ) \
            + "/" + str( self.interceptionFileName )

        # TODO : try to read the file 150 times before raising an error
        # 150 times for 2 seconds of sleep => 5 minutes of attempts
        isFileOpened = False
        openFileOperationCycle = 0
        while isFileOpened == False and self.is_active :
            try :
                file = open( interceptionCompletePath, "rb" )
                isFileOpened = True
                self.logger.debug( "File %s opened", str( interceptionCompletePath ) )
            except Exception as e :
                if openFileOperationCycle == 150 :
                    self.logger.error( "Impossible to open %s", str( interceptionCompletePath ) )
                    return
                openFileOperationCycle += 1
                if openFileOperationCycle % 10 == 0 :
                    self.logger.debug( "Impossible to open %s, retry in 2 sec", str( interceptionCompletePath ) )
                sleep( 2 )

        while self.is_active :
            # Continuously read interception file and send it to kafka broker or logstash
            bytes = file.read( self.sizeByteToRead )

            # if file is empty, wait until it will be filled up
            if bytes == b'' :
                sleep( 2 )
            else :
                message[ "data" ] = str( base64.b64encode( bytes ), 'utf-8' )
                self.sender( message, bytes )

    def stop( self ) :
        self.logger.debug( "stop task")
        self.is_active = False
        if self.tcpSocket :
            self.logger.debug( "close socket : %s", str( self.tcpSocket ) )
            self.tcpSocket.close()
Example #32
from kafka import KafkaProducer
import time

HollywoodProducer = KafkaProducer(
    bootstrap_servers="kafka-193a4a83-fjodor-0b37.aivencloud.com:15104",
    security_protocol="SSL",
    ssl_cafile="ca.pem",
    ssl_certfile="service.cert",
    ssl_keyfile="service.key",
)


for e in range(0, 230):
    message = "message number {}".format(e)
    print("Sending: {}".format(message))
    HollywoodProducer.send("Test1234", message.encode("utf-8"))
    time.sleep(0)

# Wait for all messages to be sent
HollywoodProducer.flush()
Example #33
class Annoyed(BaseANN):
    def __init__(self, metric, n_trees=4):
        if metric == 'angular':
            metric_id = 1
        elif metric == 'euclidean':
            metric_id = 0
        else:
            raise NotImplementedError(
                f'AnnoyED doesn\'t support metric {metric}')
        self._metric = metric
        self.query_id = 0
        self.name = f'Annoyed(n_trees={n_trees})'
        headers = {'Content-Type': 'application/x-www-form-urlencoded'}
        response = requests.request("POST",
                                    URL,
                                    headers=headers,
                                    data=f'{n_trees};{300000};{metric_id}')
        print(response.text.encode('utf8'))
        print(n_trees)
        self._producer = KafkaProducer(bootstrap_servers=['localhost:9092'])
        self._consumer = KafkaConsumer('sink-topic',
                                       bootstrap_servers=['localhost:9092'])

    def publish_message(self, topic_name, key, value):
        try:
            key_bytes = bytes(key, encoding='utf-8')
            value_bytes = bytes(value, encoding='utf-8')
            self._producer.send(topic_name, key=key_bytes, value=value_bytes)
            self._producer.flush()
        except Exception as ex:
            print('Exception in publishing message')
            print(ex)

    def fit(self, X):
        self.start_time = time.time()
        self.stop_time = None
        for i, x in enumerate(X):
            datapoint = {
                'datapointID': i,
                'vector': x.tolist(),
                'persist': True,
                'write': False
            }
            self.publish_message('source-topic', f'{i}', json.dumps(datapoint))

    def query(self, q, n):
        datapoint = {
            'datapointID': -1,
            'vector': q.tolist(),
            'persist': False,
            'write': True,
            'k': n
        }
        self.publish_message('source-topic', f'Query Point {self.query_id}',
                             json.dumps(datapoint))
        self.query_id += 1
        msg = next(self._consumer)
        if not self.stop_time:
            self.stop_time = time.time()
            print('Fitting time:', self.stop_time - self.start_time)
        nn = json.loads(msg.value)
        ret = [int(dp) for dp in nn['list']]
        return ret

    def __str__(self):
        return self.name
Example #34
from kafka import KafkaProducer
producer = KafkaProducer(bootstrap_servers='localhost:9092')

# Send some test events
producer.send('sample', b'Hello, CSYE7245!')
producer.send('sample', key=b'message-two', value=b'Kafka in use!')
producer.flush()
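To read those two test events back, a matching consumer sketch; the earliest offset reset and the 5 second timeout are assumptions, not taken from the snippet above.

from kafka import KafkaConsumer

consumer = KafkaConsumer('sample',
                         bootstrap_servers='localhost:9092',
                         auto_offset_reset='earliest',
                         consumer_timeout_ms=5000)
for record in consumer:
    print(record.key, record.value)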
Example #35
class ZmapKafkaProducer():
    """"""
    POLL_INTERVAL = 0.01  # seconds to wait before checking file

    def __init__(self, server, cert_path, topic_name, zmap_scan):
        """Initialize a ZmapKafkaProducer

        More to come.

        """
        self.server = server
        self.cert_path = cert_path
        self.file_path = zmap_scan.scanner.output_file
        self.scan_id = zmap_scan.scan_id
        self.scan_type = zmap_scan.type
        self.topic_name = topic_name
        self.producer = KafkaProducer(
            bootstrap_servers=self.server,
            security_protocol="SSL",
            ssl_cafile=self.cert_path + "/ca.pem",
            ssl_certfile=self.cert_path + "/service.cert",
            ssl_keyfile=self.cert_path + "/service.key",
        )

    async def __readline(self, f):
        """Hidden method to read a file asynchronously.

        """
        while True:
            data = f.readline()
            if data:
                return data
#             else:
#                 print("[INFO] ZmapKafkaProducer has no data to read.")
#                 break
            await asyncio.sleep(ZmapKafkaProducer.POLL_INTERVAL)

    async def run_async(self):

        if self.topic_name == '':
            print("[ERROR]: Kafka topic has not been set for this object.")
            quit()

        if self.file_path == '':
            print(
                "[ERROR]: The file to read has not been set for this object.")
            quit()

        if self.scan_id == '':
            print("[ERROR]: The scan_id has not been set for this object.")
            quit()

        if self.scan_type == '':
            print("[ERROR]: The scan_type has not been set for this object.")
            quit()

        msg_count = 0
        with open(self.file_path, mode='r+t', encoding='utf-8') as f:

            while True:
                line = await self.__readline(f)
                if line:
                    msg = json.loads(line)
                    msg['scan_id'] = self.scan_id
                    msg['scan_type'] = self.scan_type

                    # print a status message every 1000 cycles
                    if msg_count % 1000 == 0:
                        print("Sending message #: {:>10} | {}".format(
                            msg_count, line.strip()))
                    # send message to Kafka topic
                    self.producer.send(self.topic_name,
                                       json.dumps(msg).encode("utf-8"))
                    msg_count += 1
                else:
                    print("[INFO] ZmapKafkaProducer has no more data to send.")
                    break
        # Wait for all messages to be sent
        self.producer.flush()

    def settings(self):
        """Displays all the settings for this Kafka Producer"""
        return {
            'kafka_settings': {
                'server': self.server,
                'cert_path': self.cert_path,
                'topic_name': self.topic_name,
            },
            'file_path': self.file_path,
            'scan_id': self.scan_id,
            'poll_interval': str(self.POLL_INTERVAL)
        }
Example #36
def kafka_produce(topic, messages):
    producer = KafkaProducer(bootstrap_servers="localhost:9092")
    for message in messages:
        producer.send(topic=topic, value=message)
        producer.flush()
    print("Produced {} messages for topic {}".format(len(messages), topic))
Example #37
def main():

    ## the topic
    topic = sys.argv[1]

    ## create a Kafka producer with json serializer
    producer = KafkaProducer(
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
        bootstrap_servers=server)
    print("*** Starting measurements stream on " + server + ", topic : " + topic)
    file = open('eventsToKafka.txt', 'w')

    try:

        while True:
            #read input file and send line by line to Kafka topic
            f = open('events.txt', 'r')
            while True:

                #remove wildcard '\n' and convert to a list
                line = f.readline().strip().split()
                if line == []:
                    f.close()
                    break
                else:
                    # line fields:
                    #   id = line[0], seqId = line[1], eventId = line[2],
                    #   attnum = line[3], userid = line[4], d = line[5] (date),
                    #   t = line[6] (time), role = line[7], location = line[8],
                    #   action = line[9], res = line[10], patientId = line[11]

                    meas2 = {
                        "userid": line[4],
                        "type": "TEMP",
                        "id": line[0],
                        "seqId": line[1],
                        "eventId": line[2],
                        "attnum": line[3],
                        "d": line[5],
                        "t": line[6],
                        "role": line[7],
                        "location": line[8],
                        "action": line[9],
                        "res": line[10],
                        "patientId": line[11]
                    }
                    producer.send(topic, meas2, key=line[4].encode('utf-8'))
                    print("Sending TEMP : %s" % (
                        json.dumps(meas2).encode('utf-8')))
                    file.write("Sending TEMP : %s\n" %
                               (json.dumps(meas2).encode('utf-8')))

            sleep(1)

    except KeyboardInterrupt:
        pass

    file.close()
    print("\nIntercepted user interruption ..\nBlock until all pending messages are sent..")
    producer.flush()
Example #38
    def run(self):
        """Publish video frames as json objects, timestamped, marked with camera number.

        Source:
            self.video_path: URL for streaming video
            self.kwargs["use_cv2"]: use raw cv2 streaming, set to false to use smart fast streaming --> not every frame is sent.
        Publishes:
            A dict {"frame": string(base64encodedarray), "dtype": obj.dtype.str, "shape": obj.shape,
                    "timestamp": time.time(), "camera": camera, "frame_num": frame_num}
        """

        if self.rr_distribute:
            partitioner = RoundRobinPartitioner(partitions=
                                                [TopicPartition(topic=self.frame_topic, partition=i)
                                                 for i in range(self.topic_partitions)])

        else:

            partitioner = Murmur2Partitioner(partitions=
                                             [TopicPartition(topic=self.frame_topic, partition=i)
                                              for i in range(self.topic_partitions)])

        # Producer object, set desired partitioner
        frame_producer = KafkaProducer(bootstrap_servers=["localhost:9092"],
                                       key_serializer=lambda key: str(key).encode(),
                                       value_serializer=lambda value: json.dumps(value).encode(),
                                       partitioner=partitioner)

        print("[CAM {}] URL: {}, SET PARTITIONS FOR FRAME TOPIC: {}".format(self.camera_num,
                                                                            self.video_path,
                                                                            frame_producer.partitions_for(
                                                                                self.frame_topic)))
        # Use either option
        video = cv2.VideoCapture(self.video_path) if self.use_cv2 else VideoStream(self.video_path).start()

        # Track frame number
        frame_num = 0
        start_time = time.time()
        print("[CAM {}] START TIME {}: ".format(self.camera_num, start_time))

        # Read URL, Transform, Publish
        while True:

            # using raw cv2, frame by frame
            if self.use_cv2:
                success, image = video.read()
                # check if the file has read
                if not success:
                    if self.verbose:
                        print("[CAM {}] URL: {}, END FRAME: {}".format(self.name,
                                                                       self.video_path,
                                                                       frame_num))
                    break

            # using smart, only unique frames, skips frames, faster fps
            else:
                image = video.read()
                # check if the file has read
                if image is None:
                    if self.verbose:
                        print("[CAM {}] URL: {}, END FRAME: {}".format(self.name,
                                                                       self.video_path,
                                                                       frame_num))
                    break

            # Attach metadata to frame, transform into JSON
            message = self.transform(frame=image,
                                     frame_num=frame_num,
                                     object_key=self.object_key,
                                     camera=self.camera_num,
                                     verbose=self.verbose)

            # Partition to be sent to
            part = frame_num % self.topic_partitions
            # Logging
            if self.verbose:
                print("\r[PRODUCER][Cam {}] FRAME: {} TO PARTITION: {}".format(message["camera"],
                                                                               frame_num, part), end="")
            # Publish to specific partition
            frame_producer.send(self.frame_topic, key="{}_{}".format(self.camera_num, frame_num), value=message)

            # if frame_num % 1000 == 0:
            frame_producer.flush()

            frame_num += 1

        # clear the capture
        if self.use_cv2:
            video.release()
        else:
            video.stop()

        if self.verbose:
            print("[CAM {}] FINISHED. STREAM TIME {}: ".format(self.camera_num, time.time() - start_time))

        return True if frame_num > 0 else False
Example #39
class CheckKafka(PubSubNagiosPlugin):
    def __init__(self):
        # Python 2.x
        super(CheckKafka, self).__init__()
        # Python 3.x
        # super().__init__()
        self.name = 'Kafka'
        self.default_host = 'localhost'
        self.default_port = '9092'
        self.producer = None
        self.consumer = None
        self.topic = None
        self.client_id = 'Hari Sekhon {prog} {version}'.format(
            prog=os.path.basename(get_topfile()), version=__version__)
        self.group_id = '{client_id} {pid} {random}'.format(
            client_id=self.client_id, pid=os.getpid(), random=random_alnum(10))
        self.acks = '1'
        self.retries = 0
        self.partition = None
        self.topic_partition = None
        self.brokers = None
        self.timeout_ms = None
        self.start_offset = None
        self.sleep_secs = 0
        self.sleep_usage = 'Sleep in seconds between producing and consuming from given topic' + \
                           ' (optional, default: {} secs)'.format(self.default_sleep_secs)

    def add_options(self):
        # super(CheckKafka, self).add_options()
        self.add_opt('-B', '--brokers',
                     dest='brokers', metavar='broker_list',
                     help='Kafka Broker seed list in form host[:port],host2[:port2]... ' + \
                             '($KAFKA_BROKERS, $KAFKA_HOST:$KAFKA_PORT, default: localhost:9092)')
        self.add_opt('-H', '--host',
                     help='Kafka broker host, used to construct --brokers if not specified ' + \
                          '($KAFKA_HOST, default: {0})'.format(self.default_host))
        self.add_opt('-P', '--port',
                     help='Kafka broker port, used to construct --brokers if not specified ' + \
                          '($KAFKA_PORT, default: {0})'.format(self.default_port))
        self.add_opt('-T',
                     '--topic',
                     default=os.getenv('KAFKA_TOPIC'),
                     help='Kafka Topic ($KAFKA_TOPIC)')
        self.add_opt('-p',
                     '--partition',
                     type=int,
                     help='Kafka Partition (default: random)')
        self.add_opt(
            '-a',
            '--acks',
            default=1,
            choices=['1', 'all'],
            help=
            'Acks to require from Kafka. Valid options are \'1\' for Kafka ' +
            'partition leader, or \'all\' for all In-Sync Replicas (may block causing '
            + 'timeout if replicas aren\'t available, default: 1)')
        self.add_opt('-s',
                     '--sleep',
                     type=float,
                     default=1.0,
                     metavar='secs',
                     help=self.sleep_usage)
        self.add_opt('--list-topics',
                     action='store_true',
                     help='List Kafka topics from broker(s) and exit')
        self.add_opt('--list-partitions',
                     action='store_true',
                     help='List Kafka topic partitions from broker(s) and exit')
        self.add_thresholds(default_warning=1, default_critical=2)

    def process_broker_args(self):
        self.brokers = self.get_opt('brokers')
        host = self.get_opt('host')
        port = self.get_opt('port')
        host_env = os.getenv('KAFKA_HOST')
        port_env = os.getenv('KAFKA_PORT')
        if not host:
            # protect against blank strings in env vars
            if host_env:
                host = host_env
            else:
                host = self.default_host
        if not port:
            # protect against blank strings in env vars
            if port_env:
                port = port_env
            else:
                port = self.default_port
        brokers_env = os.getenv('KAFKA_BROKERS')
        if not self.brokers:
            if brokers_env:
                self.brokers = brokers_env
            else:
                self.brokers = '{0}:{1}'.format(host, port)
        brokers = ''
        for broker in self.brokers.split(','):
            if ':' not in broker:
                broker += ':{0}'.format(port)
            validate_hostport(broker)
            brokers += '{0}, '.format(broker)
        brokers = brokers.rstrip(', ')
        self.brokers = brokers
        log_option('brokers', self.brokers)

    def process_args(self):
        self.process_broker_args()
        self.timeout_ms = max((self.timeout * 1000 - 1000) / 2, 1000)
        sleep_secs = self.get_opt('sleep')
        if sleep_secs:
            # validation done through property wrapper
            self.sleep_secs = sleep_secs
            log_option('sleep', sleep_secs)
        try:
            list_topics = self.get_opt('list_topics')
            list_partitions = self.get_opt('list_partitions')
            if list_topics:
                self.print_topics()
                sys.exit(ERRORS['UNKNOWN'])
        except KafkaError:
            raise CriticalError(self.exception_msg())

        self.topic = self.get_opt('topic')
        if self.topic:
            validate_chars(self.topic, 'topic', r'\w\.-')
        elif list_topics or list_partitions:
            pass
        else:
            self.usage('--topic not specified')

        # because this could fail to retrieve partition metadata and we want it to throw CRITICAL if so
        try:
            self.process_partitions(list_partitions)
        except KafkaError:
            err = self.exception_msg()
            raise CriticalError(err)

        self.topic_partition = TopicPartition(self.topic, self.partition)
        self.acks = self.get_opt('acks')
        if self.acks == 'all':
            log_option('acks', self.acks)
        else:
            validate_int(self.acks, 'acks')
            self.acks = int(self.acks)
        self.validate_thresholds()

    def process_partitions(self, list_partitions=False):
        if list_partitions:
            if self.topic:
                self.print_topic_partitions(self.topic)
            else:
                for topic in self.get_topics():
                    self.print_topic_partitions(topic)
            sys.exit(ERRORS['UNKNOWN'])
        self.partition = self.get_opt('partition')
        # technically optional, will hash to a random partition, but need to know which partition to get offset
        if self.partition is None:
            log.info('partition not specified, getting random partition')
            self.partition = random.choice(
                list(self.get_topic_partitions(self.topic)))
            log.info('selected partition %s', self.partition)
        validate_int(self.partition, "partition", 0, 10000)

    def run(self):
        try:
            super(CheckKafka, self).run()
        #except KafkaError as _:
        #raise CriticalError(_)
        except KafkaError:
            err = self.exception_msg()
            raise CriticalError(err)

    def exception_msg(self):
        err = traceback.format_exc().split('\n')[-2]
        if 'NoBrokersAvailable' in err:
            err += ". Could not connect to Kafka broker(s) '{0}'".format(
                self.brokers)
        return err

    def get_topics(self):
        self.consumer = KafkaConsumer(
            bootstrap_servers=self.brokers,
            client_id=self.client_id,
            #request_timeout_ms=self.timeout_ms + 1, # must be larger than session timeout
            #session_timeout_ms=self.timeout_ms,
        )
        return self.consumer.topics()

    def print_topics(self):
        print('Kafka Topics:\n')
        for topic in self.get_topics():
            print(topic)

    def get_topic_partitions(self, topic):
        self.consumer = KafkaConsumer(
            topic,
            bootstrap_servers=self.brokers,
            client_id=self.client_id,
            #request_timeout_ms=self.timeout_ms
        )
        if topic not in self.get_topics():
            raise CriticalError(
                "topic '{0}' does not exist on Kafka broker".format(topic))
        partitions = self.consumer.partitions_for_topic(topic)
        if not isSet(partitions):
            raise UnknownError(
                'partitions returned type is {}, not a set as expected'.format(
                    type(partitions)))
        return partitions

    def print_topic_partitions(self, topic):
        print('Kafka topic \'{0}\' partitions:\n'.format(topic))
        #for partition in self.get_topic_partitions(topic):
        #    print(partition)
        print(list(self.get_topic_partitions(topic)))
        print()

    def subscribe(self):
        self.consumer = KafkaConsumer(
            #self.topic,
            bootstrap_servers=self.brokers,
            # client_id=self.client_id,
            # group_id=self.group_id,
            #request_timeout_ms=self.timeout_ms
        )
        #key_serializer
        #value_serializer
        # this is only a guess as Kafka doesn't expose its API version
        #log.debug('kafka api version: %s', self.consumer.config['api_version'])
        log.debug('partition assignments: {0}'.format(
            self.consumer.assignment()))

        # log.debug('subscribing to topic \'{0}\' parition \'{1}\''.format(self.topic, self.partition))
        # self.consumer.subscribe(TopicPartition(self.topic, self.partition))
        # log.debug('partition assignments: {0}'.format(self.consumer.assignment()))

        log.debug('assigning partition {0} to consumer'.format(self.partition))
        # self.consumer.assign([self.partition])
        self.consumer.assign([self.topic_partition])
        log.debug('partition assignments: {0}'.format(
            self.consumer.assignment()))

        log.debug('getting current offset')
        # see also highwater, committed, seek_to_end
        self.start_offset = self.consumer.position(self.topic_partition)
        if self.start_offset is None:
            # don't do this, I've seen scenario where None is returned and all messages are read again, better to fail
            # log.warn('consumer position returned None, resetting to zero')
            # self.start_offset = 0
            raise UnknownError(
                'Kafka Consumer reported current starting offset = {0}'.format(
                    self.start_offset))
        log.debug('recorded starting offset \'{0}\''.format(self.start_offset))
        # self.consumer.pause()

    def publish(self):
        log.debug('creating producer')
        self.producer = KafkaProducer(
            bootstrap_servers=self.brokers,
            client_id=self.client_id,
            acks=self.acks,
            batch_size=0,
            max_block_ms=self.timeout_ms,
            #request_timeout_ms=self.timeout_ms + 1, # must be larger than session timeout
            #session_timeout_ms=self.timeout_ms,
        )
        #key_serializer
        #value_serializer
        log.debug('producer.send()')
        self.producer.send(self.topic,
                           key=self.key.encode('utf-8'),
                           partition=self.partition,
                           value=self.publish_message.encode('utf-8'))
        log.debug('producer.flush()')
        self.producer.flush()

    def consume(self):
        self.consumer.assign([self.topic_partition])
        log.debug('consumer.seek({0})'.format(self.start_offset))
        self.consumer.seek(self.topic_partition, self.start_offset)
        # self.consumer.resume()
        log.debug('consumer.poll(timeout_ms={0})'.format(self.timeout_ms))
        obj = self.consumer.poll(timeout_ms=self.timeout_ms)
        log.debug('msg object returned: %s', obj)
        msg = None
        try:
            for consumer_record in obj[self.topic_partition]:
                if consumer_record.key == self.key.encode('utf-8'):
                    msg = consumer_record.value.decode('utf-8')
                    break
        except KeyError:
            raise UnknownError('TopicPartition key was not found in response')
        if msg is None:
            raise UnknownError(
                "failed to find matching consumer record with key '{0}'".
                format(self.key))
        return msg
Ejemplo n.º 40
0
kafka_producer_obj = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda x: dumps(x).encode('utf-8'))


def processData(ticker):
    ts = TimeSeries(key=api_key, output_format='pandas')
    intraData, meta_data = ts.get_intraday(symbol=ticker,
                                           interval='5min',
                                           outputsize='compact')
    #Remove enumeration from col names
    for column in intraData.columns:
        intraData.rename({column: column.split('. ')[1]}, axis=1, inplace=True)
    return intraData


data = processData('IBM')

for ind in data.index:
    stock = {}
    #   stock['date'] = data['date'][ind]
    stock['date'] = str(ind)
    stock['open'] = data['open'][ind]
    stock['high'] = data['high'][ind]
    stock['low'] = data['low'][ind]
    stock['close'] = data['close'][ind]
    stock['volume'] = data['volume'][ind]
    print("stock to be sent: ", stock)
    kafka_producer_obj.send("capstone3", stock)
    kafka_producer_obj.flush()
    time.sleep(1)
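For reference, a minimal consumer sketch that reads these records back. The broker address, the 'capstone3' topic and the JSON encoding are taken from the producer above; everything else is illustrative.

from json import loads
from kafka import KafkaConsumer

# Mirrors the producer's value_serializer: decode the bytes, then parse the JSON
stock_consumer = KafkaConsumer(
    'capstone3',
    bootstrap_servers='localhost:9092',
    auto_offset_reset='earliest',
    value_deserializer=lambda m: loads(m.decode('utf-8')))

for record in stock_consumer:
    print("stock received:", record.value)  # dict with date/open/high/low/close/volume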
Ejemplo n.º 41
0
class KafkaClient(object):
    def __init__(self, kafka_hosts, ssl=False, username=None, password=None):
        """
        Initializes the Kafka client
        Args:
            kafka_hosts (list): A list of Kafka hostnames
            (with optional port numbers)
            ssl (bool): Use a SSL/TLS connection
            username (str): An optional username
            password (str):  An optional password

        Notes:
            ``use_ssl=True`` is implied when a username or password are
            supplied.

            When using Azure Event Hubs, the username is literally
            ``$ConnectionString``, and the password is the
            Azure Event Hub connection string.
        """
        config = dict(value_serializer=lambda v: json.dumps(v).encode('utf-8'),
                      bootstrap_servers=kafka_hosts,
                      client_id="parsedmarc-{0}".format(__version__))
        if ssl or username or password:
            config["security_protocol"] = "SSL"
            config["ssl_context"] = create_default_context()
            if username or password:
                config["sasl_plain_username"] = username or ""
                config["sasl_plain_password"] = password or ""
        try:
            self.producer = KafkaProducer(**config)
        except NoBrokersAvailable:
            raise KafkaError("No Kafka brokers available")

    @staticmethod
    def strip_metadata(report):
        """
          Duplicates org_name, org_email and report_id into JSON root
          and removes report_metadata key to bring it more in line
          with Elastic output.
        """
        report['org_name'] = report['report_metadata']['org_name']
        report['org_email'] = report['report_metadata']['org_email']
        report['report_id'] = report['report_metadata']['report_id']
        report.pop('report_metadata')

        return report

    @staticmethod
    def generate_daterange(report):
        """
        Creates a date_range timestamp with format YYYY-MM-DD-T-HH:MM:SS
        based on begin and end dates for easier parsing in Kibana.

        Move to utils to avoid duplication w/ elastic?
        """

        metadata = report["report_metadata"]
        begin_date = human_timestamp_to_datetime(metadata["begin_date"])
        end_date = human_timestamp_to_datetime(metadata["end_date"])
        begin_date_human = begin_date.strftime("%Y-%m-%dT%H:%M:%S")
        end_date_human = end_date.strftime("%Y-%m-%dT%H:%M:%S")
        date_range = [begin_date_human, end_date_human]
        logger.debug("date_range is {}".format(date_range))
        return date_range

    def save_aggregate_reports_to_kafka(self, aggregate_reports,
                                        aggregate_topic):
        """
        Saves aggregate DMARC reports to Kafka

        Args:
            aggregate_reports (list):  A list of aggregate report dictionaries
            to save to Kafka
            aggregate_topic (str): The name of the Kafka topic

        """
        if (type(aggregate_reports) == dict
                or type(aggregate_reports) == OrderedDict):
            aggregate_reports = [aggregate_reports]

        if len(aggregate_reports) < 1:
            return

        for report in aggregate_reports:
            report['date_range'] = self.generate_daterange(report)
            report = self.strip_metadata(report)

            for slice in report['records']:
                slice['date_range'] = report['date_range']
                slice['org_name'] = report['org_name']
                slice['org_email'] = report['org_email']
                slice['policy_published'] = report['policy_published']
                slice['report_id'] = report['report_id']
                logger.debug("Sending slice.")
                try:
                    logger.debug("Saving aggregate report to Kafka")
                    self.producer.send(aggregate_topic, slice)
                except UnknownTopicOrPartitionError:
                    raise KafkaError(
                        "Kafka error: Unknown topic or partition on broker")
                except Exception as e:
                    raise KafkaError("Kafka error: {0}".format(e.__str__()))
                try:
                    self.producer.flush()
                except Exception as e:
                    raise KafkaError("Kafka error: {0}".format(e.__str__()))

    def save_forensic_reports_to_kafka(self, forensic_reports, forensic_topic):
        """
        Saves forensic DMARC reports to Kafka, sends individual
        records (slices) since Kafka requires messages to be <= 1MB
        by default.

        Args:
            forensic_reports (list):  A list of forensic report dicts
            to save to Kafka
            forensic_topic (str): The name of the Kafka topic

        """
        if type(forensic_reports) == dict:
            forensic_reports = [forensic_reports]

        if len(forensic_reports) < 1:
            return

        try:
            logger.debug("Saving forensic reports to Kafka")
            self.producer.send(forensic_topic, forensic_reports)
        except UnknownTopicOrPartitionError:
            raise KafkaError(
                "Kafka error: Unknown topic or partition on broker")
        except Exception as e:
            raise KafkaError("Kafka error: {0}".format(e.__str__()))
        try:
            self.producer.flush()
        except Exception as e:
            raise KafkaError("Kafka error: {0}".format(e.__str__()))
Ejemplo n.º 42
0
logger = logging.getLogger('For kafka Ktable topic')
logger.setLevel(logging.INFO)

csvfile = open('your_csv_file_path', 'r')

fieldnames = ('id', 'listing_url', 'name', 'transit', 'host_name',
              'neighbourhood', 'neighbourhood_cleansed',
              'neighbourhood_group_cleansed', 'city', 'state', 'zipcode',
              'country_code', 'country', 'latitude', 'longitude',
              'property_type', 'room_type', 'security_deposit', 'cleaning_fee',
              'minimum_nights', 'maximum_nights', 'calendar_last_scraped',
              'number_of_reviews', 'review_scores_rating',
              'review_scores_accuracy', 'review_scores_cleanliness',
              'review_scores_checkin', 'review_scores_communication',
              'review_scores_location', 'review_scores_value',
              'cancellation_policy', 'reviews_per_month')

reader = csv.DictReader(csvfile, fieldnames)
producer = KafkaProducer(bootstrap_servers='localhost:9092')

for row in reader:
    id = row['id']
    del row['id']
    data = json.dumps(row)
    producer.send(topic='detailed_listing', key=id.encode('utf-8'),
                  value=data.encode('utf-8'))
    print('Keep writing data to kafka')

logger.info('Closing KafkaProducer')
producer.flush(10)
producer.close(10)
logger.info('KafkaProducer closed')
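An equivalent sketch that configures key/value serializers on the producer itself, so the send loop needs no manual .encode() calls; the broker and topic are the ones used above.

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    key_serializer=str.encode,
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))

for row in reader:
    row_id = row.pop('id')  # key stays a plain string; the serializer encodes it
    producer.send(topic='detailed_listing', key=row_id, value=row)

producer.flush(10)
producer.close(10)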
Ejemplo n.º 43
0
    network_filter = args.network_filter  # Filter for network for detection (regex filtering), e.g. "10\.10\..+"

    # Spark context initialization
    sc = SparkContext(
        appName=application_name + " " +
        " ".join(sys.argv[1:]))  # Application name used as the appName
    ssc = StreamingContext(sc, 1)  # Spark microbatch is 1 second

    # Initialize input DStream of flows from specified Zookeeper server and Kafka topic
    input_stream = KafkaUtils.createStream(
        ssc, args.input_zookeeper, "spark-consumer-" + application_name,
        {args.input_topic: kafka_partitions})

    # Run the detection of ddos
    ddos_result = inspect_ddos(input_stream)

    # Initialize kafka producer
    kafka_producer = KafkaProducer(bootstrap_servers=args.output_zookeeper,
                                   client_id="spark-producer-" +
                                   application_name)

    # Process the results of the detection and send them to the specified host
    ddos_result.foreachRDD(
        lambda rdd: print_and_send(rdd, kafka_producer, args.output_topic))

    # Send any remaining buffered records
    kafka_producer.flush()

    # Start input data processing
    ssc.start()
    ssc.awaitTermination()
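The print_and_send helper is not part of this snippet; a hypothetical callback matching the foreachRDD call above could look like the following (the formatting of each result is an assumption).

def print_and_send(rdd, producer, topic):
    # Hypothetical sketch only: collect this microbatch's detection results,
    # log them, and forward each one to the output Kafka topic
    for result in rdd.collect():
        print(result)
        producer.send(topic, str(result).encode('utf-8'))
    producer.flush()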
Ejemplo n.º 44
0
def ShowStrains():
	datta = requests.get("http://strainapi.evanbusse.com/KPriyQg/strains/search/all")
	data = datta.json()
	producer = KafkaProducer(bootstrap_servers = ['localhost:9092'],value_serializer=lambda m: json.dumps(m).encode('ascii'))
	producer.send(TOPIC, data)
	producer.flush()
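The search/all response is sent above as one large message; the variant sketch below publishes one message per strain instead, which keeps each record well under Kafka's default message-size cap (roughly 1 MB). The per-strain structure of the response is an assumption.

def ShowStrainsPerRecord():
    data = requests.get("http://strainapi.evanbusse.com/KPriyQg/strains/search/all").json()
    producer = KafkaProducer(bootstrap_servers=['localhost:9092'],
                             value_serializer=lambda m: json.dumps(m).encode('ascii'))
    # Assumption: the API returns a JSON object keyed by strain name
    for name, strain in data.items():
        producer.send(TOPIC, {"name": name, "strain": strain})
    producer.flush()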
Ejemplo n.º 45
0
class ProducerServer:
    """A Kafka Producer used to simulate SF Crime data

    Attributes:
        bootstrap_servers (str): comma-separated Kafka brokers, e.g. localhost:9092,localhost:9093
        input_file (str): path to the data file for simulation
        topic_name (str): Kafka topic name
        key_serializer (Callable): function used to serialize the message key into bytes
        value_serializer (Callable): function used to serialize the message value into bytes
        num_partitions (int): number of partitions for the topic
        replication_factor (int): number of replications for the topic
        **conf: other key word arguments passed to kafka.KafkaProducer
    """
    def __init__(
        self,
        bootstrap_servers: str,
        input_file: str,
        topic_name: str,
        key_serializer: Callable = str.encode,
        value_serializer: Callable = JsonSerializer().serialize,
        num_partitions: int = 3,
        replication_factor: int = 1,
        **conf,
    ):
        self.bootstrap_servers = bootstrap_servers
        self.input_file = input_file
        self.topic_name = topic_name
        self.num_partitions = num_partitions
        self.replication_factor = replication_factor
        self._key_serializer = key_serializer
        self._value_serializer = value_serializer
        self.conf: dict = conf
        self.producer = KafkaProducer(key_serializer=key_serializer,
                                      value_serializer=value_serializer,
                                      **conf)

    @cached_property
    def client(self) -> KafkaAdminClient:
        """KafkaAdminClinet to manage topics and other cluster metadata"""
        bootstrap_servers: List[str] = self.bootstrap_servers.split(",")
        admin_client = KafkaAdminClient(bootstrap_servers=bootstrap_servers,
                                        client_id=self.conf.get("client_id"))
        return admin_client

    def create_topic(self):
        """Create Kafka topic on the brokers"""
        new_topic = NewTopic(
            name=self.topic_name,
            num_partitions=self.num_partitions,
            replication_factor=self.replication_factor,
        )

        try:
            resp = self.client.create_topics([new_topic], timeout_ms=10000)
        except TopicAlreadyExistsError:
            logger.info(f"Topic already exists: {new_topic.name}")
        else:
            for topic_name, err_code, err_msg in resp.topic_errors:
                if err_code != 0:
                    raise f"Error Code [{err_code}] when creating {topic_name}: {err_msg}"
            logger.info(f"Topic created: {topic_name}")
        finally:
            self.client.close()

    def generate_data(self):
        """Iterate the JSON data and send it to the Kafka Topic"""
        data = self.read_data()
        for record in data:
            key = record.get("crime_id")

            logger.debug(f"Message| key={key} | value={record}")
            future = self.producer.send(topic=self.topic_name,
                                        key=key,
                                        value=record)
            future.add_callback(self._on_success).add_errback(self._on_err)
            time.sleep(random.random())

    def read_data(self) -> dict:
        """Load in a JSON data file"""
        with open(self.input_file) as json_file:
            return json.load(json_file)

    def close(self):
        """Flush out all buffered messages and close down the producer gracefully"""
        self.producer.flush(timeout=10)
        self.producer.close(timeout=10)

    def _on_success(self, record_metadata):
        """Message on sucess callback"""
        logger.debug(
            f"Successful delivery - {record_metadata.topic}[{record_metadata.partition}]:{record_metadata.offset}"
        )

    def _on_err(self, exc):
        """Message on error callback"""
        logger.error(exc)
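A hedged driver for ProducerServer; the data file, topic name and client id are placeholders. Note that __init__ forwards only **conf to KafkaProducer, so the producer itself falls back to kafka-python's default of localhost:9092; the bootstrap_servers argument is used by the admin client only.

server = ProducerServer(
    bootstrap_servers="localhost:9092",                      # used by KafkaAdminClient
    input_file="police-department-calls-for-service.json",   # assumed data file
    topic_name="sf.crime.calls",                             # assumed topic name
    client_id="sf-crime-producer",                           # forwarded to KafkaProducer via **conf
)
try:
    server.create_topic()
    server.generate_data()
finally:
    server.close()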
Ejemplo n.º 46
0
def training_online():
    log = logging.getLogger(__name__)
    client_id = f'client_{os.uname().machine}_{hex(os.getpid())}'
    consumer = KafkaConsumer(
        'desconhecidos',
        'clusters',
        bootstrap_servers='localhost:9092,localhost:9093,localhost:9094',
        group_id='training_online',
        client_id=client_id,
        value_deserializer=msgpack.unpackb,
        key_deserializer=msgpack.unpackb,
        # StopIteration if no message after 1 sec
        # consumer_timeout_ms=10 * 1000,
        # max_poll_records=10,
        auto_offset_reset='latest',
    )
    kprod = KafkaProducer(
        bootstrap_servers='localhost:9092,localhost:9093,localhost:9094',
        value_serializer=msgpack.packb,
        key_serializer=msgpack.packb,
    )
    #
    unknownBuffer = []
    sleepClusters = []
    noveltyIndex = 0
    clusters = []
    centers = []
    counter = 0
    elapsed = 0
    totalTime = time.time()
    log.info('READY')
    try:
        source = __name__
        clusters = req_block(log, consumer, kprod, client_id, source)
        log.info(f'READY {client_id}')
        for message in consumer:
            # message{ topic, partition, offset, key, value }
            if message.topic == 'clusters' and b'clusters' in message.value and b'source' in message.value and message.value[
                    b'source'] == 'offline':
                remoteClusters = decodeClusters(message.value[b'clusters'])
                # centers = mkCenters(clusters)
                # log.info(f'got clusters {len(clusters)}')
                continue
            if message.topic != 'desconhecidos':
                continue
            #
            init = time.time()
            example_decoded = {
                k.decode(encoding='utf-8'): v
                for k, v in message.value[b'example'].items()
            }
            if example_decoded['label'] is not None:
                example_decoded['label'] = example_decoded['label'].decode(
                    encoding='utf-8')
            example = Example(**example_decoded)
            example.item = np.array(example.item)
            unknownBuffer.append(example)
            counter += 1
            if counter % 10 == 0:
                log.info(f'unknownBuffer {counter}')
            if len(clusters) == 0:
                elapsed += time.time() - init
                continue
            #
            classified = []
            recurence = []
            extensions = []
            novelty = []
            #
            if len(unknownBuffer) < BUFF_FULL:
                elapsed += time.time() - init
                continue
            #
            # log.info('unknownBuffer > BUFF_FULL')
            if len(sleepClusters) > 0:
                sleepClustersCenters = mkCenters(sleepClusters)
                if len(sleepClustersCenters) > 0:
                    recurenceDetection(log, clusters, sleepClusters,
                                       sleepClustersCenters, unknownBuffer,
                                       classified, recurence, counter)
                else:
                    log.info('\n\n\tsleep Clusters Centers WARN')
                    log.info(sleepClusters)
                    log.info(sleepClustersCenters)
                    log.info('\n\n')
                #
            #
            if len(unknownBuffer) % (BUFF_FULL // 10) == 0:
                noveltyIndex = noveltyDetection(log, clusters, sleepClusters,
                                                unknownBuffer, classified,
                                                extensions, noveltyIndex,
                                                novelty, counter)
            if counter % CLEANUP_WINDOW == 0:
                # yield '[cleanup]'
                log.info(f'[cleanup] {counter}')
                for ex in unknownBuffer:
                    if counter - ex.n < 3 * CLEANUP_WINDOW:
                        unknownBuffer.remove(ex)
                for cl in clusters:
                    if counter - cl.latest < 2 * CLEANUP_WINDOW:
                        sleepClusters.append(cl)
                        clusters.remove(cl)
                if len(clusters) == 0:
                    # yield f'[fallback] {len(sleepClusters)} => clusters'
                    log.info(f'[fallback] {len(sleepClusters)} => clusters')
                    # fallback
                    clusters.extend(sleepClusters)
                    sleepClusters.clear()
                #
            #
            if len(classified) > 0:
                classe_contagem = {}
                for ex in classified:
                    if not ex.label in classe_contagem:
                        classe_contagem[ex.label] = 0
                    classe_contagem[ex.label] += 1
                sortedKeys = sorted(classe_contagem,
                                    key=lambda x: x if type(x) == str else '')
                classe_contagem = {k: classe_contagem[k] for k in sortedKeys}
                value = {
                    'classe-contagem': classe_contagem,
                    'nbytes': example.item.nbytes,
                    'source': 'online'
                }
                kprod.send(topic='classe-contagem',
                           value=value,
                           key=message.key)
            if len(recurence) == 0 and len(extensions) == 0 and len(
                    novelty) == 0:
                elapsed += time.time() - init
                continue
            # BROADCAST clusters
            clusters_serial = [c.__getstate__() for c in clusters]
            value = {'source': 'online', 'clusters': clusters_serial}
            kprod.send(topic='clusters', value=value, key=message.key)
            #
            clusters_serial = [c.__getstate__() for c in clusters]
            value = {'source': 'online', 'clusters': clusters_serial}
            kprod.send(topic='novidades', value=value, key=message.key)

            classified
            recurence
            extensions
            novelty
            elapsed += time.time() - init
        #
    except KeyboardInterrupt:
        pass
    except Exception as ex:
        log.exception(ex)
        raise
    finally:
        speed = counter // max(0.001, elapsed)
        elapsed = int(elapsed)
        log.info(
            f'DONE {elapsed} s, consumed {counter} items, {speed} i/s, {time.time() - totalTime}'
        )
        kprod.flush()
    #


#

# def kafka2gen(kafkaConsumer):
#     for record in kafkaConsumer:
#         yield record

# def online():
#     minasOffline
#     minas = MinasBase()
#     minas.
Ejemplo n.º 47
0
class InferenceCache(object):
    '''
    Caches queries & predictions to facilitate communication between predictor & inference workers.
    '''
    def __init__(self,
                 hosts=os.environ.get('KAFKA_HOST', 'localhost'),
                 ports=os.environ.get('KAFKA_PORT', 9092)):
        hostlist = hosts.split(',')
        portlist = ports.split(',')
        self.connection_url = [
            f'{host}:{port}' for host, port in zip(hostlist, portlist)
        ]
        self.producer = KafkaProducer(bootstrap_servers=self.connection_url,
                                      max_request_size=134217728,
                                      buffer_memory=134217728)

    def add_predictions_for_worker(self, worker_id: str,
                                   predictions: List[Prediction]):
        logger.info(
            f'Adding {len(predictions)} prediction(s) for worker "{worker_id}"'
        )

        for prediction in predictions:
            name = f'workers_{worker_id}_{prediction.query_id}_prediction'
            prediction = pickle.dumps(prediction)
            self.producer.send(name,
                               key=name.encode('utf-8'),
                               value=prediction)
            self.producer.flush()

    def take_prediction_for_worker(self, worker_id: str,
                                   query_id: str) -> Union[Prediction, None]:
        name = f'workers_{worker_id}_{query_id}_prediction'

        prediction_consumer = KafkaConsumer(
            name,
            bootstrap_servers=self.connection_url,
            auto_offset_reset='earliest',
            group_id=PREDICTIONS_QUEUE)
        prediction = None
        try:
            prediction = next(prediction_consumer).value
            prediction_consumer.commit()
            prediction = pickle.loads(prediction)
        except KafkaError:
            pass
        prediction_consumer.close()
        logger.info(
            f'Took prediction for query "{query_id}" from worker "{worker_id}"'
        )
        return prediction

    def add_queries_for_worker(self, worker_id: str, queries: List[Query]):
        name = f'workers_{worker_id}_queries'
        queries = [pickle.dumps(x) for x in queries]
        logger.info(
            f'Adding {len(queries)} querie(s) for worker "{worker_id}"...')
        for query in queries:
            self.producer.send(name, key=name.encode('utf-8'), value=query)
            self.producer.flush()

    def pop_queries_for_worker(self, worker_id: str,
                               batch_size: int) -> List[Query]:
        name = f'workers_{worker_id}_queries'

        RETRY_TIMES = 4
        while True:
            try:
                query_consumer = KafkaConsumer(
                    name,
                    bootstrap_servers=self.connection_url,
                    auto_offset_reset='earliest',
                    group_id=QUERIES_QUEUE)
                break
            except Exception as e:
                logger.error('Kafka conn Error, retry: {}'.format(RETRY_TIMES))
                logger.error(traceback.format_exc())
                RETRY_TIMES -= 1
                time.sleep(1)
                if RETRY_TIMES <= 0:
                    raise

        partition = TopicPartition(name, 0)
        partitiondic = query_consumer.end_offsets([partition])
        offsetend = partitiondic.get(partition, None)
        if offsetend == 0:
            query_consumer.close()
            return []
        try:
            queries = []
            while True:
                record = next(query_consumer)
                queries.append(record.value)
                query_consumer.commit()
                if record.offset >= offsetend - 1 or len(
                        queries) == batch_size:
                    break

            queries = [pickle.loads(x) for x in queries]
            query_consumer.close()
            return queries
        except KafkaError:
            query_consumer.close()
            return []

    def __del__(self):
        self.producer.close()
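A rough round trip through the cache above; the worker id is made up and namedtuples stand in for the project's Query/Prediction types (the cache only needs picklable objects with the attributes it reads).

from collections import namedtuple

# Stand-ins for the surrounding project's types (assumption: anything picklable works)
Query = namedtuple('Query', ['query_id', 'payload'])
Prediction = namedtuple('Prediction', ['query_id', 'label'])

cache = InferenceCache()
cache.add_queries_for_worker('worker-1', [Query('q-1', b'input-bytes')])

for query in cache.pop_queries_for_worker('worker-1', batch_size=1):
    cache.add_predictions_for_worker('worker-1', [Prediction(query.query_id, 'cat')])

result = cache.take_prediction_for_worker('worker-1', 'q-1')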
Ejemplo n.º 48
0
import json
from bson import json_util
import yaml

from kafka import KafkaProducer

p = KafkaProducer(bootstrap_servers='localhost:9092')

with open('dataset.json') as f:
    data = json.load(f)
    for x in data:
        p.send('sample',
               json.dumps(x, default=json_util.default).encode('utf-8'))
p.flush()
Ejemplo n.º 49
0
class ReplayCerebralCortexData:
    def __init__(self, config):
        """
        Constructor
        :param configuration:
        """
        self.config = config
        self.sqlData = SqlData(config)
        self.blacklist_regex = self.config["blacklist"]
        self.replay_type = self.config["data_replay"]["replay_type"]
        self.kafka_broker = self.config["kafkaserver"]["host"]
        self.data_dir = self.config["data_replay"]["data_dir"]
        if (self.data_dir[-1] != '/'):
            self.data_dir += '/'

        self.producer = KafkaProducer(bootstrap_servers=self.kafka_broker, api_version=(0, 10),
                                      value_serializer=lambda v: json.dumps(v).encode('utf-8'))

        participant_ids = []
        if str(config["users"]["uuids"]).strip()!="":
            participant_ids = str(config["users"]["uuids"]).split(",")

        if self.replay_type=="filez":
            self.read_data_dir(participant_ids)
        elif self.replay_type=="mydb":
            self.db_data(participant_ids)
        else:
            raise ValueError("Replay type can only be filez or mydb")

    def db_data(self, participant_ids):
        results = self.sqlData.get_data(participant_ids, self.blacklist_regex)
        if len(results)>0:
            for row in results:
                files_list = []
                for f in json.loads(row["files_list"]):
                    files_list.append(self.data_dir+f)
                self.produce_kafka_message({"user_id": row["owner_id"], "day": row["day"], "stream_id": row["stream_id"], "files_list": files_list})
            self.producer.flush()
        else:
            print("No record. You may need to run store_dirs_to_db.py if you want to use mydb data replay type.")

    def read_data_dir(self, participant_ids):
        data_dir = []
        if len(participant_ids)>0:
            for participant_id in participant_ids:
                data_dir.append(self.data_dir+participant_id.strip())
        else:
            data_dir = [entry.path for entry in os.scandir(self.data_dir) if entry.is_dir()]
        for participant in data_dir:
            for day_dir in os.scandir(participant):
                if day_dir.is_dir():
                    for stream_dir in os.scandir(day_dir):
                        if stream_dir.is_dir():
                            stream_dir = stream_dir.path
                            tmp = stream_dir.split("/")[-3:]
                            user_id  = tmp[0]
                            day = tmp[1]
                            stream_id = tmp[2]
                            files_list = []
                            for f in os.listdir(stream_dir):
                                if f.endswith(".gz"):
                                    files_list.append(stream_dir+"/"+f)
                            self.produce_kafka_message({"user_id": user_id, "day": day, "stream_id": stream_id, "files_list": files_list})

    def produce_kafka_message(self, filename):
        metadata = ""

        base_dir_path = self.data_dir.replace(filename["user_id"],"")
        day = filename["day"]

        if filename["files_list"][0]:
            metadata_filename = filename["files_list"][0].replace(".gz", ".json")
            metadata_file = open(metadata_filename, 'r')
            metadata = metadata_file.read()
            metadata_file.close()
            try:
                metadata = json.loads(metadata)
            except:
                metadata = metadata

        files_list = ','.join(filename["files_list"])
        files_list = files_list.replace(base_dir_path, "")

        self.producer.send("hdfs_filequeue", {"metadata": metadata, "day":day, "filename": files_list})

        print("Yielding file:", filename["files_list"][0])
Ejemplo n.º 50
0
    def run(self):
        """Publish video frames as json objects, timestamped, marked with camera number.
        Source:
            self.video_path: URL for streaming video
            self.kwargs["use_cv2"]: use raw cv2 streaming, set to false to use smart fast streaming --> not every frame is sent.
        Publishes:
            A dict {"frame": string(base64encodedarray), "dtype": obj.dtype.str, "shape": obj.shape,
                    "timestamp": time.time(), "camera": camera, "frame_num": frame_num}
        """
        if self.rr_distribute:
            partitioner = RoundRobinPartitioner(partitions=[
                TopicPartition(topic=self.frame_topic, partition=i)
                for i in range(self.topic_partitions)
            ])
        else:
            partitioner = Murmur2Partitioner(partitions=[
                TopicPartition(topic=self.frame_topic, partition=i)
                for i in range(self.topic_partitions)
            ])

        # Producer object, set desired partitioner
        frame_producer = KafkaProducer(
            bootstrap_servers=[params.KAFKA_BROKER],
            key_serializer=lambda key: str(key).encode(),
            value_serializer=lambda value: json.dumps(value).encode(),
            partitioner=partitioner,
            max_request_size=134217728)

        print("[CAM {}] URL: {}, SET PARTITIONS FOR FRAME TOPIC: {}".format(
            self.camera_num, self.video_path,
            frame_producer.partitions_for(self.frame_topic)))
        # Use either option
        if self.use_cv2:
            # video = cv2.VideoCapture(self.video_path)
            # Here we use sampler to read all videos from a folder
            self.sampler.add_video(self.video_path)
        else:
            video = VideoStream(self.video_path).start()

        # Track frame number
        frame_num = 0
        start_time = time.time()
        print("[CAM {}] START TIME {}: ".format(self.camera_num, start_time))

        while True:
            if self.use_cv2:
                success, image, self.location = self.sampler.read()
                if not success:
                    if self.verbose:
                        print("[CAM {}] URL: {}, END FRAME: {}".format(
                            self.name, self.video_path, frame_num))
                    break
            else:
                image = video.read()
                if image is None:
                    if self.verbose:
                        print("[CAM {}] URL: {}, END FRAME: {}".format(
                            self.name, self.video_path, frame_num))
                    break
            # Attach metadata to frame, transform into JSON
            message = self.transform(frame=image,
                                     frame_num=frame_num,
                                     location=self.location,
                                     object_key=self.object_key,
                                     camera=self.camera_num,
                                     verbose=self.verbose)
            self.sizecnt += 1
            if time.time() - self.timer > self.report_range:
                acc = self.sizecnt
                #if self.verbose:
                print("[Cam {}]Minute {} send out size {}".format(
                    self.camera_num,
                    int(self.timer - self.zerotime) // self.report_range, acc))
                self.sizecnt = 0
                self.timer = time.time()

            # Callback function
            def on_send_success(record_metadata):
                print(record_metadata.topic)
                print(record_metadata.partition)
                print(record_metadata.offset)

            def on_send_error(excp):
                print(excp)
                # log.error('I am an errback', exc_info=excp)

            #  Partition to be sent to
            part = frame_num % self.topic_partitions
            # Logging
            # Publish to specific partition
            if self.verbose:
                print("\r[PRODUCER][Cam {}] FRAME: {} TO PARTITION: {}".format(
                    message["camera"], frame_num, part))
                frame_producer.send(
                    self.frame_topic,
                    key="{}_{}".format(self.camera_num, frame_num),
                    value=message).add_callback(on_send_success).add_errback(
                        on_send_error)
            else:
                frame_producer.send(self.frame_topic,
                                    key="{}_{}".format(self.camera_num,
                                                       frame_num),
                                    value=message)

            # if frame_num % 1000 == 0:
            frame_producer.flush()

            frame_num += 1

        if self.use_cv2:
            self.sampler.release()
        else:
            video.stop()

        if self.verbose:
            print("[CAM {}] FINISHED. STREAM TIME {}: ".format(
                self.camera_num,
                time.time() - start_time))

        return True if frame_num > 0 else False
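On the consuming side, the frame dict documented in the docstring above can be turned back into an array. This decode sketch assumes transform() base64-encodes frame.tobytes(), which is the usual counterpart of the 'frame', 'dtype' and 'shape' fields.

import base64
import json

import numpy as np

def decode_frame(message_value: bytes):
    """Rebuild the image array from a published frame message (sketch)."""
    msg = json.loads(message_value)
    raw = base64.b64decode(msg["frame"])  # assumption: transform() used b64(frame.tobytes())
    return np.frombuffer(raw, dtype=np.dtype(msg["dtype"])).reshape(msg["shape"])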
Ejemplo n.º 51
0
def main():
    # parse topic_ids
    topic_indices = _parse_topic_indices(topic_ids)
    
    s3 = boto3.resource('s3')
    
    producer = KafkaProducer(bootstrap_servers=bootstrap_servers,
                             acks=1, linger_ms=10,
                             batch_size=786432)
    print(bootstrap_servers)
    
    num_of_messages = 0
    lines = {}
    topic_offsets = [None] * 8000  # there will be up to 8000 topics
    is_topic_produced = [False] * 8000
    fileName_idx_min_width = 6
    is_message_production_started = False
    
    while True:
        line_idx_neg_check = 0
        for topic_idx in topic_indices:
            if topic_offsets[topic_idx] is None:
                
                # get fileName
                file_name = file_name_format.format(
                    idx_str=str(topic_idx).zfill(fileName_idx_min_width)
                )
                s3_obj = s3.Object(bucket_name, file_name)
                print(bucket_name, '/', file_name)
                
                # read files from s3
                obj_body = s3_obj.get()['Body'].read().decode('utf-8')
                lines[topic_idx] = obj_body.split('\n')
                topic_offsets[topic_idx] = -1
            
            elif is_topic_produced[topic_idx] is False:
                # produce messages to each topic
                topic_offsets[topic_idx] += 1
                
                line = lines[topic_idx][topic_offsets[topic_idx]]
                line_obj = json.loads(line)
                topic = line_obj['topic']
                
                producer.send(topic, value=line.encode('utf-8'))
                
                num_of_messages += 1
                is_message_production_started = True
                line_idx_neg = int(line_obj['segment_meta']['index_neg'])
                
                if line_idx_neg == -1:
                    is_topic_produced[topic_idx] = True
                
                # Sleep for 1 ms after every 500 messages produced.
                if num_of_messages % 500 == 0:
                    time.sleep(0.001)
                
                if line_idx_neg < line_idx_neg_check:
                    line_idx_neg_check = line_idx_neg
            
        # end for
        
        if is_message_production_started:
            print('At most ' + str(-line_idx_neg_check - 1) +
                  ' messages per topic left to produce.')
            
            # Break the while loop, if no more messages to produce
            if line_idx_neg_check == -1:
                break
    
    producer.flush()
    print(str(num_of_messages) + ' lines produced.')
Ejemplo n.º 52
0
class SchemaBackup:
    def __init__(self,
                 config: Config,
                 backup_path: str,
                 topic_option: Optional[str] = None) -> None:
        self.config = config
        self.backup_location = backup_path
        self.topic_name = topic_option or self.config["topic_name"]
        self.log = logging.getLogger("SchemaBackup")
        self.consumer = None
        self.producer = None
        self.admin_client = None
        self.timeout_ms = 1000

    def init_consumer(self):
        self.consumer = KafkaConsumer(
            self.topic_name,
            enable_auto_commit=False,
            bootstrap_servers=self.config["bootstrap_uri"],
            client_id=self.config["client_id"],
            security_protocol=self.config["security_protocol"],
            ssl_cafile=self.config["ssl_cafile"],
            ssl_certfile=self.config["ssl_certfile"],
            ssl_keyfile=self.config["ssl_keyfile"],
            sasl_mechanism=self.config["sasl_mechanism"],
            sasl_plain_username=self.config["sasl_plain_username"],
            sasl_plain_password=self.config["sasl_plain_password"],
            auto_offset_reset="earliest",
            metadata_max_age_ms=self.config["metadata_max_age_ms"],
            kafka_client=KarapaceKafkaClient,
        )

    def init_producer(self):
        self.producer = KafkaProducer(
            bootstrap_servers=self.config["bootstrap_uri"],
            security_protocol=self.config["security_protocol"],
            ssl_cafile=self.config["ssl_cafile"],
            ssl_certfile=self.config["ssl_certfile"],
            ssl_keyfile=self.config["ssl_keyfile"],
            sasl_mechanism=self.config["sasl_mechanism"],
            sasl_plain_username=self.config["sasl_plain_username"],
            sasl_plain_password=self.config["sasl_plain_password"],
            kafka_client=KarapaceKafkaClient,
        )

    def init_admin_client(self):
        start_time = time.monotonic()
        wait_time = constants.MINUTE
        while True:
            if time.monotonic() - start_time > wait_time:
                raise Timeout(
                    f"Timeout ({wait_time}) on creating admin client")

            try:
                self.admin_client = KafkaAdminClient(
                    api_version_auto_timeout_ms=constants.
                    API_VERSION_AUTO_TIMEOUT_MS,
                    bootstrap_servers=self.config["bootstrap_uri"],
                    client_id=self.config["client_id"],
                    security_protocol=self.config["security_protocol"],
                    ssl_cafile=self.config["ssl_cafile"],
                    ssl_certfile=self.config["ssl_certfile"],
                    ssl_keyfile=self.config["ssl_keyfile"],
                    kafka_client=KarapaceKafkaClient,
                )
                break
            except (NodeNotReadyError, NoBrokersAvailable, AssertionError):
                self.log.warning(
                    "No Brokers available yet, retrying init_admin_client()")
            except:  # pylint: disable=bare-except
                self.log.exception(
                    "Failed to initialize admin client, retrying init_admin_client()"
                )

            time.sleep(2.0)

    def _create_schema_topic_if_needed(self):
        if self.topic_name != self.config["topic_name"]:
            self.log.info(
                "Topic name overridden, not creating a topic with schema configuration"
            )
            return

        self.init_admin_client()

        start_time = time.monotonic()
        wait_time = constants.MINUTE
        while True:
            if time.monotonic() - start_time > wait_time:
                raise Timeout(
                    f"Timeout ({wait_time}) on creating admin client")

            schema_topic = KafkaSchemaReader.get_new_schema_topic(self.config)
            try:
                self.log.info("Creating schema topic: %r", schema_topic)
                self.admin_client.create_topics(
                    [schema_topic],
                    timeout_ms=constants.TOPIC_CREATION_TIMEOUT_MS)
                self.log.info("Topic: %r created successfully",
                              self.config["topic_name"])
                break
            except TopicAlreadyExistsError:
                self.log.info("Topic: %r already exists",
                              self.config["topic_name"])
                break
            except:  # pylint: disable=bare-except
                self.log.exception(
                    "Failed to create topic: %r, retrying _create_schema_topic_if_needed()",
                    self.config["topic_name"])
                time.sleep(5)

    def close(self):
        self.log.info("Closing schema backup reader")
        if self.consumer:
            self.consumer.close()
            self.consumer = None
        if self.producer:
            self.producer.close()
            self.producer = None
        if self.admin_client:
            self.admin_client.close()
            self.admin_client = None

    def request_backup(self):
        if not self.consumer:
            self.init_consumer()
        self.log.info("Starting schema backup read for topic: %r",
                      self.topic_name)

        values = []
        topic_fully_consumed = False

        while not topic_fully_consumed:

            raw_msg = self.consumer.poll(timeout_ms=self.timeout_ms)
            topic_fully_consumed = len(raw_msg) == 0

            for _, messages in raw_msg.items():
                for message in messages:
                    key = message.key.decode("utf8")
                    try:
                        key = json.loads(key)
                    except json.JSONDecodeError:
                        self.log.debug(
                            "Invalid JSON in message.key: %r, value: %r",
                            message.key, message.value)
                    value = None
                    if message.value:
                        value = message.value.decode("utf8")
                        try:
                            value = json.loads(value)
                        except json.JSONDecodeError:
                            self.log.debug(
                                "Invalid JSON in message.value: %r, key: %r",
                                message.value, message.key)
                    values.append((key, value))

        ser = json.dumps(values)
        if self.backup_location:
            with open(self.backup_location, "w") as fp:
                fp.write(ser)
                self.log.info("Schema backup written to %r",
                              self.backup_location)
        else:
            print(ser)
            self.log.info("Schema backup written to stdout")
        self.close()

    def restore_backup(self):
        if not os.path.exists(self.backup_location):
            raise BackupError("Backup location doesn't exist")

        self._create_schema_topic_if_needed()

        if not self.producer:
            self.init_producer()
        self.log.info("Starting backup restore for topic: %r", self.topic_name)

        values = None
        with open(self.backup_location, "r") as fp:
            raw_msg = fp.read()
            values = json.loads(raw_msg)

        if not values:
            return

        for item in values:
            key = encode_value(item[0])
            value = encode_value(item[1])
            future = self.producer.send(self.topic_name, key=key, value=value)
            self.producer.flush(timeout=self.timeout_ms)
            msg = future.get(self.timeout_ms)
            self.log.debug("Sent kafka msg key: %r, value: %r, offset: %r",
                           key, value, msg.offset)
        self.close()
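A hedged driver for the backup class. A plain dict stands in for Karapace's Config object here; only the keys the class actually reads are shown, and all values are placeholders.

config = {
    "bootstrap_uri": "localhost:9092",
    "client_id": "karapace-backup",
    "topic_name": "_schemas",
    "security_protocol": "PLAINTEXT",
    "ssl_cafile": None,
    "ssl_certfile": None,
    "ssl_keyfile": None,
    "sasl_mechanism": None,
    "sasl_plain_username": None,
    "sasl_plain_password": None,
    "metadata_max_age_ms": 60000,
}

backup = SchemaBackup(config, backup_path="schemas.backup.json")
backup.request_backup()    # dump the schema topic to the backup file
# backup.restore_backup()  # or replay the backup file into the topic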
Ejemplo n.º 53
0
class Producer:
    """
    Wrapper around the kafka-python KafkaProducer
    """
    def __init__(self):
        pass

    def __enter__(self):
        self.cfg = Config().cfg
        self.producer = KafkaProducer(
            bootstrap_servers=self.cfg["serList"],
            # api_version=self.cfg["apiVersion"],
            api_version_auto_timeout_ms=self.cfg["autoVersionTimeout"],
            security_protocol=self.cfg["protocol"],
            sasl_mechanism=self.cfg["mechanism"],
            sasl_kerberos_service_name=self.cfg["kerverosSerName"],
        )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.producer.close()

    def flush(self):
        """
        Makes all buffered records ready to send immediately (typically called after send() when a flush is needed).
        :return:
        """
        try:
            self.producer.flush(timeout=TIME_OUT)
        except KafkaTimeoutError:
            log.tag_error(KafkaInfo.KafkaProducer,
                          "Flush buffered record failed, TimeOut")
            raise ActionError(KafkaErr.FlushFailed)

    def metrics(self):
        """
        Get the producer's performance metrics (including each Kafka broker).
        :return:
        """
        performance = self.producer.metrics()
        return performance

    def partition_set_get(self, topic_name: str):
        """
        Get all partitions of the topic
        :param topic_name:
        :return: set
        """
        return self.producer.partitions_for(topic_name)

    def send_message(self, topic_name: str, value: bytes, key: str):
        """
        Produce data to Kafka
        :param topic_name: topic where the message will be published
        :param value: message value
        :param key: key to associate with the message
        :return:
        """
        try:
            result = self.producer.send(topic_name,
                                        value=value,
                                        key=key.encode("utf-8")).add_errback(
                                            self.send_err,
                                            topic=topic_name,
                                            value=value,
                                            key=key)
        except KafkaTimeoutError:
            log.tag_warn(
                KafkaInfo.KafkaProducer,
                "Kafka send data timeout, topic: %s, key: %s, msg: %s" %
                (topic_name, key, value.decode("utf-8")))
            raise ActionError(KafkaErr.SendDataFailed)
        return result

    @staticmethod
    def send_err(topic: str, value: bytes, key: str):
        """
        producer send data failed callback function
        :param topic:
        :param value:
        :param key:
        :return:
        """
        log.tag_error(
            KafkaInfo.KafkaProducer, "Kafka send data failed, topic: %s, "
            "key: %s msg: %s" % (topic, key, value.decode("utf-8")))
        raise ActionError(KafkaErr.SendDataFailed)
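Because the wrapper implements __enter__/__exit__, it is intended to be used as a context manager; the topic, key and payload below are illustrative.

# Illustrative use of the wrapper above; topic, key and payload are made up
with Producer() as producer:
    partitions = producer.partition_set_get("orders")
    producer.send_message("orders", value=b'{"id": 1}', key="order-1")
    producer.flush()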
Ejemplo n.º 54
0
class ProducerClient(TestAdapterLib.Adapter):
	@doc_public
	def __init__ (self, parent, name=None, bootstrap_servers=None, 
											debug=False, agentSupport=False, agent=None, 
											shared=False, verbose=True, 
											logEventSent=True, logEventReceived=True):
		"""
		KAFKA Producer client Adapter. Mapping of kafka-python KafkaProducer
		
		@param parent: parent testcase
		@type parent: testcase
		
		@bootstrap_servers: Kafka broker used to bootstrap at connect call (list of ip address port)
		@type bootstrap_servers: List
		
		@param agent: agent to use when this mode is activated
		@type agent: string/None
		
		@param name: adapter name used with from origin/to destination (default=None)
		@type name: string/none
		
		@param debug: active debug mode (default=False)
		@type debug:	boolean
		
		@param shared: shared adapter (default=False)
		@type shared:	boolean
		"""
		TestAdapterLib.Adapter.__init__(self, name = __NAME__, parent = parent, 
																									debug=debug, realname=name,
																									agentSupport=agentSupport, 
																									agent=agent, shared=shared,
																									caller=TestAdapterLib.caller(),
																									agentType=AGENT_TYPE_EXPECTED)
		self.parent = parent
		self.codecX2D = Xml2Dict.Xml2Dict()
		self.codecD2X = Dict2Xml.Dict2Xml(coding = None)
		self.logEventSent = logEventSent
		self.logEventReceived = logEventReceived
		self.parent = parent
		self.cfg = {}
		if agent is not None:
			self.cfg['agent'] = agent
			self.cfg['agent-name'] = agent['name']
		self.cfg['agent-support'] = agentSupport
		
		self.TIMER_ALIVE_AGT = TestAdapterLib.Timer(parent=self, duration=20, 
																																			name="keepalive-agent", 
																																			callback=self.aliveAgent,
																																			logEvent=False, enabled=True)
		self.__checkConfig()
		
		# initialize the agent with no data
		if agent is not None:
			if self.cfg['agent-support']:
				self.prepareAgent(data={'shared': shared})
				if self.agentIsReady(timeout=30) is None: 
					raise Exception("Agent %s is not ready" % self.cfg['agent-name'] )
				self.TIMER_ALIVE_AGT.start()

		self.bootstrap_servers = bootstrap_servers

		if debug:
			self.__getKafkaClientLogger()

	def __checkConfig(self):
		"""
		"""
		self.debug("config: %s" % self.cfg)		
		self.warning('Agent used Name=%s Type=%s' % (self.cfg['agent']['name'], 
																																										self.cfg['agent']['type']) ) 
	
	
	def encapsule(self, *input_layers):
		"""
		Encapsule layers in template message
		"""
		if self.cfg['agent-support']:
			layer_agent= TestTemplatesLib.TemplateLayer('AGENT')
			layer_agent.addKey(name='name', data=self.cfg['agent']['name'] )
			layer_agent.addKey(name='type', data=self.cfg['agent']['type'] )

		tpl = TestTemplatesLib.TemplateMessage()

		if self.cfg['agent-support']:
			tpl.addLayer(layer=layer_agent)
		for layer in input_layers:
			tpl.addLayer(layer=layer)

		return tpl
		
		
	def onReset(self):
		"""
		Called automaticly on reset adapter
		"""
		# stop timer
		self.TIMER_ALIVE_AGT.stop()
		
		# cleanup remote agent
		self.resetAgent()

	def receivedNotifyFromAgent(self, data):
		"""
		Function to reimplement
		"""
		if 'cmd' in data:
			if data['cmd'] == AGENT_INITIALIZED:
				tpl = TestTemplatesLib.TemplateMessage()
				layer = TestTemplatesLib.TemplateLayer('AGENT')
				layer.addKey("ready", True)
				layer.addKey(name='name', data=self.cfg['agent']['name'] )
				layer.addKey(name='type', data=self.cfg['agent']['type'] )
				tpl.addLayer(layer= layer)
				self.logRecvEvent( shortEvt = "Agent Is Ready" , tplEvt = tpl )

			elif data['cmd'] == "producer_{0}".format(CONNECT):
				self.__kafka_connected = True
				tpl = templates.kafka_ops(method=CONNECT,bootstrap_servers=self.bootstrap_servers)
				self.logRecvEvent( shortEvt = "connected", tplEvt = self.encapsule(self.producerTpl ,tpl))

			elif data['cmd'] == "producer_{0}".format(SEND):
				record_metadata = data['result']
				self.__kafka_send = True
				rec = { "Topic":record_metadata[0], 
										"Partition": record_metadata[1] , 
										"Offset":record_metadata[3] , 
										"Timestamp": record_metadata[4] ,
										"Checksum": record_metadata[5], 
										"Serialized_key_size": record_metadata[6], 
										"Serialized_value_size": record_metadata[7]}
				tpl = templates.kafka_ops(method=SEND, more=rec)
				self.logRecvEvent( shortEvt = "sended", tplEvt =  self.encapsule(self.producerTpl ,tpl))

			elif data['cmd'] =="producer_{0}".format(FLUSH) :
				tpl = templates.kafka_ops(method=FLUSH)
				self.logRecvEvent( shortEvt = "flushed", tplEvt =  self.encapsule(self.producerTpl ,tpl))	

			elif data['cmd'] =="producer_{0}".format(PARTITIONS_FOR) :
				partitions = data['result']
				tpl = templates.kafka_ops(method=PARTITIONS_FOR, partitions=partitions)
				self.logRecvEvent( shortEvt = "partitions_for", tplEvt =  self.encapsule(self.producerTpl ,tpl))				
			elif data['cmd'] == "producer_{0}".format(CLOSE):
				tpl = templates.kafka_ops(method=CLOSE)
				self.logRecvEvent( shortEvt = "closed", tplEvt =  self.encapsule(self.producerTpl ,tpl))			
		else:
			self.warning( 'Notify received from agent: %s' % data )

	def receivedErrorFromAgent(self, data):
		"""
		Function to reimplement
		"""
		if "cmd" in data:
			if data['cmd'] in [ CONNECT, CLOSE, SEND, FLUSH,PARTITIONS_FOR	]:
				tpl = self.encapsule(self.producerTpl, templates.response_err(msg=data['err-msg'], method=data['cmd'] ))
				self.logRecvEvent( shortEvt = "response error", tplEvt = tpl )
				
			else:
				self.error("unknown command received: %s" % data["cmd"])
				
		else:
			self.error( 'Generic error: %s' % data )

	def receivedDataFromAgent(self, data):
		"""
		Function to reimplement
		"""
		self.warning( 'Data received from agent: %s' % data )
		
	def prepareAgent(self, data):
		"""
		Prepare agent
		"""
		self.parent.sendReadyToAgent(adapterId=self.getAdapterId(),
									agentName=self.cfg['agent-name'],
									agentData=data)
																										
	def initAgent(self, data):
		"""
		Init agent
		"""
		self.parent.sendInitToAgent(adapterId=self.getAdapterId(),
									agentName=self.cfg['agent-name'],
									agentData=data)
																								
	def resetAgent(self):
		"""
		Reset agent
		"""
		self.parent.sendResetToAgent(adapterId=self.getAdapterId(),
									agentName=self.cfg['agent-name'],
									agentData='')
																										
	def aliveAgent(self):
		"""
		Keep alive agent
		"""
		self.parent.sendAliveToAgent(adapterId=self.getAdapterId(),
									agentName=self.cfg['agent-name'],
									agentData='')
		self.TIMER_ALIVE_AGT.restart()

	def sendInitToAgent(self, data):
		"""
		"""
		self.parent.sendInitToAgent(adapterId=self.getAdapterId(),
									agentName=self.cfg['agent-name'],
									agentData=data)
																								
	def sendNotifyToAgent(self, data):
		"""
		"""
		self.parent.sendNotifyToAgent(adapterId=self.getAdapterId(),
									agentName=self.cfg['agent-name'],
									agentData=data)

	def sendResetToAgent(self, data):
		"""
		"""
		self.parent.sendResetToAgent(adapterId=self.getAdapterId(),
									agentName=self.cfg['agent-name'],
									agentData=data)

	def agentIsReady(self, timeout=1.0):
		"""
		Waits to receive the "agent ready" event until the end of the timeout.

		@param timeout: maximum time, in seconds, to wait for the event (default=1s)
		@type timeout: float

		@return: the event matching the template, or None otherwise
		@rtype: templatemessage
		"""
		tpl = TestTemplatesLib.TemplateMessage()
		layer = TestTemplatesLib.TemplateLayer('AGENT')
		layer.addKey("ready", True)
		layer.addKey(name='name', data=self.cfg['agent']['name'] )
		layer.addKey(name='type', data=self.cfg['agent']['type'] )
		tpl.addLayer(layer= layer)
		evt = self.received( expected = tpl, timeout = timeout )
		return evt
		
	def __getKafkaClientLogger(self):
		"""
		Redirect the kafka-python client logs to stdout at debug level.
		"""
		logger = logging.getLogger('kafka')
		logger.addHandler(logging.StreamHandler(sys.stdout))
		logger.setLevel(logging.DEBUG)

	@doc_public
	def connect(self, **kargs ):
		"""
		Instantiate the KafkaProducer and fetch the Kafka cluster metadata.

		@param kargs: keyword arguments accepted by the KafkaProducer class
		@type kargs: keyword
		
		"""
		if 'bootstrap_servers' in kargs:
			bootstrap_servers = kargs.pop('bootstrap_servers')
		else:
			bootstrap_servers=self.bootstrap_servers

		# Log the connection request event
		self.producerTpl = templates.kafka_connect(api=PRODUCER,bootstrap_servers=bootstrap_servers, **kargs)
		tpl = templates.kafka_ops(method=CONNECT,bootstrap_servers=bootstrap_servers, **kargs)
		self.logSentEvent( shortEvt = "connection", tplEvt = self.encapsule(self.producerTpl,tpl))

		self.__kafka_connected = False

		# Agent mode
		if self.cfg['agent-support']:
			remote_cfg = {
							'cmd': "producer_{0}".format(CONNECT),
							'bootstrap_servers': bootstrap_servers,
							'kargs': kargs
						}
			self.sendNotifyToAgent(data=remote_cfg)
				
		else:
			try:
				self.producer = KafkaProducer(bootstrap_servers=bootstrap_servers, **kargs )
				tpl = templates.kafka_ops(method=CONNECT,bootstrap_servers=bootstrap_servers, **kargs)
				self.logRecvEvent( shortEvt = "connected", tplEvt = self.encapsule(self.producerTpl,tpl))
			except KafkaError  as e:
				tpl = self.encapsule(self.producerTpl,  templates.response_err(msg=e, method=CONNECT ))
				self.logRecvEvent( shortEvt = "response error", tplEvt = tpl )

	@doc_public			
	def send(self, topic, **kargs):
		"""
		Publish a message to a topic.

		@topic (str): topic where the message will be published    
		
		@value (optional): message value as bytes.
		
		@partition (int, optional): optionally specify a partition. If not set, the partition will be selected using the configured 'partitioner'.
		
		@key (optional): a key to associate with the message. Can be used to determine which partition to send the message to. 
		
		@timestamp_ms (int, optional): epoch milliseconds (from Jan 1 1970 UTC) to use as the message timestamp. Defaults to current time.
		"""		
		tpl = templates.kafka_ops(method=SEND, **kargs)
		self.logSentEvent( shortEvt = "req send", tplEvt = self.encapsule(self.producerTpl ,tpl))

		# Timeout for retrieving the record metadata
		if "timeout" in kargs:
			timeout = kargs.pop("timeout")
		else:
			timeout=2
		if self.cfg['agent-support']:
			remote_cfg = {
							'cmd': "producer_{0}".format(SEND),
							'topic': topic,
							'timeout': timeout,
							'kargs': kargs
						}
			self.sendNotifyToAgent(data=remote_cfg)
		else:
			try:
				future = self.producer.send(topic,**kargs)
				record_metadata=future.get(timeout=timeout)

				rec = { "Topic":record_metadata[0], 
										"Partition": record_metadata[1] , 
										"Offset":record_metadata[3] , 
										"Timestamp": record_metadata[4] ,
										"Checksum": record_metadata[5], 
										"Serialized_key_size": record_metadata[6], 
										"Serialized_value_size": record_metadata[7]}
				tpl = templates.kafka_ops(method=SEND, more=rec)
				self.logRecvEvent( shortEvt = "resp send", tplEvt =  self.encapsule(self.producerTpl,tpl))
			except KafkaError  as e:
				tpl = self.encapsule(self.producerTpl,  templates.response_err(msg=e, method=SEND ))
				self.logRecvEvent( shortEvt = "response error", tplEvt = tpl )

	@doc_public	
	def partitions_for(self, topic):
		"""
		Full function documentation is available at http://kafka-python.readthedocs.io.
		"""		
		tpl = templates.kafka_ops(method=PARTITIONS_FOR, topic=topic)
		self.logSentEvent( shortEvt = "req partitions_for", tplEvt = self.encapsule(self.producerTpl ,tpl))

		if self.cfg['agent-support']:
			remote_cfg = {
							'cmd': "producer_{0}".format(PARTITIONS_FOR),
							'topic': topic
						}
			self.sendNotifyToAgent(data=remote_cfg)
		else:
			try:
				partitions = self.producer.partitions_for(topic)	
				tpl = templates.kafka_ops(method=PARTITIONS_FOR,topic=topic, partitions=partitions)
				self.logRecvEvent( shortEvt = "resp partitions_for", tplEvt =  self.encapsule(self.producerTpl,tpl))	
			except KafkaError  as e:
				tpl = self.encapsule(self.producerTpl,  templates.response_err(msg=e, method=PARTITIONS_FOR ))
				self.logRecvEvent( shortEvt = "response error", tplEvt = tpl )				

	@doc_public
	def flush(self, timeout=None):
		"""
		Full function documentation is available at http://kafka-python.readthedocs.io.
		"""		
		tpl = templates.kafka_ops(method=FLUSH, timeout=timeout)
		self.logSentEvent( shortEvt = "req flush", tplEvt = self.encapsule(self.producerTpl,tpl))	

		if self.cfg['agent-support']:
			remote_cfg = {
							'cmd': "producer_{0}".format(FLUSH),
							'timeout': timeout
						}
			self.sendNotifyToAgent(data=remote_cfg)
		else:
			try:
				self.producer.flush(timeout)	
				tpl = templates.kafka_ops(method=FLUSH)
				self.logRecvEvent( shortEvt = "resp flush", tplEvt =  self.encapsule(self.producerTpl,tpl))	
			except KafkaError  as e:
				tpl = self.encapsule(self.producerTpl,  templates.response_err(msg=e, method=FLUSH ))
				self.logRecvEvent( shortEvt = "response error", tplEvt = tpl )

	@doc_public
	def close(self, timeout=None):
		"""
		Full function documentation is available at http://kafka-python.readthedocs.io.
		"""		
		tpl = templates.kafka_ops(method=CLOSE, timeout=timeout)
		self.logSentEvent( shortEvt = "req close", tplEvt = self.encapsule(self.producerTpl,tpl))	

		if self.cfg['agent-support']:
			remote_cfg = {
							'cmd': "producer_{0}".format(CLOSE),
							'timeout': timeout
						}
			self.sendNotifyToAgent(data=remote_cfg)
		else:
			try:
				self.producer.close(timeout=timeout)
				tpl = templates.kafka_ops(method=CLOSE,timeout=timeout)
				self.logRecvEvent( shortEvt = "closed", tplEvt =  self.encapsule(self.producerTpl,tpl))	
			except KafkaError  as e:
				tpl = self.encapsule(self.producerTpl,  templates.response_err(msg=e, method=CLOSE ))
				self.logRecvEvent( shortEvt = "response error", tplEvt = tpl )
				
	@doc_public
	def isSend(self, timeout=2, record=None):
		"""
		Wait to receive response from "send" request and match returned RecordMetadata  until the end of the timeout.

		@param timeout: time max to wait to receive event in second (default=2s)
		
		@type timeout: float	
		
		@param offset: Optional RecordMetadata that we expect to be assigned to consumer 
		
		@type offset:  RecordMetadata
		"""
		TestAdapterLib.check_timeout(caller=TestAdapterLib.caller(), timeout=timeout)
		
		if record is None:
			record = { "Topic": TestOperatorsLib.Any(),
					"Partition": TestOperatorsLib.Any(),
					"Offset": TestOperatorsLib.Any(),
					"Timestamp": TestOperatorsLib.Any(),
					"Checksum": TestOperatorsLib.Any(),
					"Serialized_key_size": TestOperatorsLib.Any(),
					"Serialized_value_size": TestOperatorsLib.Any() }
		expected = templates.kafka_ops(method=SEND, more=record)
		# try to match the template 
		evt = self.received( expected=self.encapsule( self.producerTpl ,expected ), timeout=timeout )
		return evt

	@doc_public		
	def isConnect(self, timeout=2):
		"""
		Wait to receive response from "connect" request until the end of the timeout

		@param timeout: time max to wait to receive event in second (default=2s)
		@type timeout: float		
		"""
		TestAdapterLib.check_timeout(caller=TestAdapterLib.caller(), timeout=timeout)
		
		# construct the expected template
		expected = templates.kafka_ops(method=CONNECT, bootstrap_servers=self.bootstrap_servers)
		# try to match the template 
		evt = self.received( expected=self.encapsule( self.producerTpl ,expected), timeout=timeout )
		return evt	

	@doc_public
	def isFlush(self, timeout=2):
		"""
		Wait to receive response from "flush" request until the end of the timeout

		@param timeout: time max to wait to receive event in second (default=2s)
		@type timeout: float		
		"""
		TestAdapterLib.check_timeout(caller=TestAdapterLib.caller(), timeout=timeout)
		
		# construct the expected template
		expected = templates.kafka_ops(method=FLUSH)
		# try to match the template 
		evt = self.received( expected=self.encapsule( self.producerTpl ,expected), timeout=timeout )
		return evt		
	
	@doc_public    
	def isClose(self, timeout=2):
		"""
		Wait to receive response from "close" request until the end of the timeout

		@param timeout: time max to wait to receive event in second (default=2s)
		@type timeout: float		
		"""
		TestAdapterLib.check_timeout(caller=TestAdapterLib.caller(), timeout=timeout)
		
		# construct the expected template
		expected = templates.kafka_ops(method=CLOSE)
		# try to match the template 
		evt = self.received( expected=self.encapsule( self.producerTpl ,expected), timeout=timeout )
		return evt		
		
	@doc_public
	def isPartitions_for(self, timeout=2,partitions=None):
		"""
		Wait to receive response from "partitions_for" request and match returned Topics until the end of the timeout.

		@param timeout: time max to wait to receive event in second (default=2s)
		@type timeout: float		

		@param offset: Optional list that we expect to be view by producer 
		@type offset: list of of Topics
		"""
		TestAdapterLib.check_timeout(caller=TestAdapterLib.caller(), timeout=timeout)

		if partitions is None:
			partitions = { "partitions": TestOperatorsLib.Any() }
		expected = templates.kafka_ops(method=PARTITIONS_FOR,more=partitions)
		# try to match the template 
		evt = self.received( expected=self.encapsule( self.producerTpl ,expected), timeout=timeout )
		return evt	
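The adapter above pairs every producer request with a matching is*() check on the event log. As a minimal usage sketch (not part of the original example), assuming PRODUCER is an instance of this adapter already created by the test framework with a reachable bootstrap server, and "illustrative-topic" is a placeholder name:

# Hypothetical test body using only the adapter methods defined above.
PRODUCER.connect()
if PRODUCER.isConnect(timeout=5) is None:
	raise Exception("unable to connect to the Kafka cluster")

# Publish one record, then match the RecordMetadata returned through the event log.
PRODUCER.send(topic="illustrative-topic", value=b"hello")
if PRODUCER.isSend(timeout=5) is None:
	raise Exception("no send confirmation received")

# Flush pending records and close the producer.
PRODUCER.flush()
if PRODUCER.isFlush(timeout=5) is None:
	raise Exception("flush not confirmed")
PRODUCER.close()
if PRODUCER.isClose(timeout=5) is None:
	raise Exception("close not confirmed")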
Ejemplo n.º 55
0
    def handle(self, *args, **options):
        """实时数据
        """
        timeout = options.get("timeout")
        try:
            agent_id = Agent.objects.get(id=options.get("agent_id"))
            kafka_broker = SelfServiceConf.objects.get(
                agent_id=agent_id, service='kafka')
        except Agent.DoesNotExist:
            logger.info("no agent")
            return
        except SelfServiceConf.DoesNotExist:
            logger.info("no kafka")
            return
        # Instantiate a KafkaProducer instance, used to publish messages to Kafka
        producer = KafkaProducer(
            bootstrap_servers=kafka_broker.host,
            # bootstrap_servers="hd-datanode2:9092",
            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
            api_version=(0, 8, 0)
        )
        # metrics = producer.metrics()
        # print(metrics)
        dst_ip = options.get("dst_ip")
        try:
            dst_ip_location = geoip.find_ip_location(dst_ip)[1]
        except Exception:
            dst_ip_location = "北京"

        while True:
            time_now = timezone.localtime(timezone.now())
            logger.info(time_now.strftime("%Y-%m-%d %H:%M:%S"))
            timestamp = int(time.mktime(time_now.timetuple()))
            # producer.send('web-log', '{"status":"301","body_bytes_sent":"178","bytes_sent":"500","prospector":{"type":"log"},"upstream_status":"\"301\"","http_referer":"\"http://[139.219.97.13]\"","upstream_response_time":"\"0.160\"","time_iso8601":"2018-03-21T15:41:43+08:00","tags":["beats_input_codec_plain_applied"],"request_length":"146","server_protocol":"HTTP/1.1","type":"261263cd-3895-4ec8-88a1-7468cca63a4a","beat":{"name":"DED-Azure-North-QINGSONG-QSSEC-NODE01","version":"6.2.2","hostname":"DED-Azure-North-QINGSONG-QSSEC-NODE01"},"offset":297961,"method":"GET","http_x_forwarded_for":"\"-\"","@timestamp":"2018-03-21T07:41:47.065Z","request_time":"0.160","http_user_agent":"\"Go-http-client/1.1\"","@version":"1","request_uri":"/","upstream_addr":"\"139.219.226.91:443\"","host":["DED-Azure-North-QINGSONG-QSSEC-NODE01","www.qssec.com"],"remote_addr":"139.219.100.132","hostname":"ded-azure-north-qingsong-qssec-node01","real_ip":"139.219.100.132","source":"/usr/local/nginx/logs/access.www.qssec.com_ssl.log","message":"2018-03-21T15:41:43+08:00 ded-azure-north-qingsong-qssec-node01 139.219.100.132 139.219.100.132 www.qssec.com GET / HTTP/1.1 301 146 500 178 0.160 \"139.219.226.91:443\" \"301\" \"-\" \"0.160\" \"http://[139.219.97.13]\" \"Go-http-client/1.1\" \"-\"","upstream_cache_status":"\"-\""}')
            # UV

            realtime_uv = cache.get("ssa_realtime_uv")
            if not realtime_uv:
                realtime_uv = random.randint(20, 35)
                cache.set("ssa_realtime_uv", realtime_uv, timeout=timeout)

            producer.send('realtime_uv', [realtime_uv])
            logger.info("realtime_uv: {}".format(realtime_uv))

            # Real-time visits
            web_log_ip_map = cache.get("ssa_web_log_ip_map")
            if not web_log_ip_map:
                ips = geoip.random_china_ip(realtime_uv)
                web_log_ip_map = [
                    {"ip": ip[0], "location": ip[1]} for ip in ips
                ]
                cache.set("ssa_web_log_ip_map",
                          web_log_ip_map, timeout=timeout)

            producer.send('web_log_ip_map', web_log_ip_map)
            logger.info("web_log_ip_map: {}".format(len(web_log_ip_map)))

            # Real-time attacks
            attack_map = cache.get("ssa_attack_map")
            if not attack_map:
                ips = geoip.random_china_ip(random.randint(2, 50))
                attack_map = []
                for ip in ips:
                    level_random = random.randint(0, 100)
                    # pos = [98, 95, 90, 80, 0]
                    pos = [90, 75, 50, 0]
                    index = 3
                    for idx, p in enumerate(pos):
                        if level_random > p:
                            index = idx
                    level = ["严重", "高", "中", "低"][index]
                    attack_map.append(
                        {
                            "time": timezone.localtime(timezone.datetime.fromtimestamp(random.randint(timestamp-timeout, timestamp)).replace(tzinfo=timezone.get_current_timezone())).strftime("%Y-%m-%d %H:%M:%S"),
                            "dst_ip": dst_ip_location,
                            "dst_ip_location": dst_ip_location,
                            "level": level,
                            "src_ip": ip[0],
                            "src_ip_location": ip[1]
                        }
                    )
                attack_map = sorted(attack_map, key=lambda elem: elem["time"])

                cache.set("ssa_attack_map", attack_map, timeout=timeout)
            producer.send('attack_map', attack_map)
            logger.info("attack_map: {}".format(len(attack_map)))
            producer.flush()

            time.sleep(options.get("frequency"))
Ejemplo n.º 56
0
def publish_messages(networks=None,
                     select_dates=None,
                     metrics=None,
                     is_test=False):
    if networks is None or len(networks) == 0:
        networks = all_networks
    if select_dates is None or len(select_dates) == 0:
        # scan over the past four days (offsets 1-4) if no date was set
        select_dates = []
        for offset in range(1, 5):
            select_dates.append(date.today() - timedelta(days=offset))
    if metrics is None or len(metrics) == 0:
        metrics = ['NLNMDeviationMetric:0.5-1']
    for select_date, network, metric in product(select_dates, networks,
                                                metrics):
        date_string = select_date.strftime("%Y-%m-%d")
        output = call_dqa(network=network,
                          metric=metric,
                          begin=date_string,
                          end=date_string,
                          format='csv')
        # set up the kafka (producer) connection here
        # the default max_block_ms is 60000 ms; use a shorter 10 s block time here
        blocktime = 10000
        producer = KafkaProducer(
            bootstrap_servers='igskcicgvmkafka.cr.usgs.gov:9092',
            client_id='producer-from-dqa',
            acks=0,
            max_block_ms=blocktime,
            value_serializer=lambda v: json.dumps(v).encode('utf-8'))
        # each line is effectively a row in the database
        data = csv.reader(output.splitlines(), skipinitialspace=True)
        # note that the csv reader only exposes the data through its iterator --
        # fortunately the first entry can still easily be checked to determine
        # whether there is really any content to iterate over
        for record in data:
            # DQA returns an error line (not an empty list) when it has no data for a given day/metric:
            if str(record[0]).startswith("Error"):
                if is_test:
                    print("Nothing available for", select_date, network,
                          metric)
                break  # go back to outer loop, no data in this record exists
            # now we get the fields and jsonify them for publication
            # value order is how they come out of the call_dqa method
            (r_date, network, station, location, channel, metric,
             value) = record
            if is_test:
                print(r_date, network, station, location, channel, metric,
                      value)
            # get the topic name derived from the metric and run type
            topic_name = topic_fix(metric, is_test)
            # json format description:
            # https://github.com/usgs/earthquake-detection-formats (cont.)
            # /blob/master/format-docs/StationInfo.md
            # we have some custom formats added here to disambiguate metric
            # and to give the date of data this metric was evaluated upon
            message = {
                "Type": "StationInfo",
                "Site": {
                    "Network": network,
                    "Station": station,
                    "Location": location,
                    "Channel": channel
                },
                "Quality": value,
                "Date": date_string,
                "Enable": "true",
                "Metric": metric
            }
            # next step is to actually send this message
            # metric (topic) name might have disallowed character in it
            # print(topic_name, message)
            producer.send(topic_name, message)
        producer.flush()
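With acks=0 the producer above is fire-and-forget: publish_messages only relies on producer.flush() and never learns whether a record was actually written. If delivery confirmation mattered, the future returned by send() could be checked instead -- a sketch only, with a placeholder broker and topic:

import json
from kafka import KafkaProducer
from kafka.errors import KafkaError

producer = KafkaProducer(bootstrap_servers='localhost:9092',
                         acks='all',
                         value_serializer=lambda v: json.dumps(v).encode('utf-8'))
future = producer.send('example-topic', {"Type": "StationInfo"})
try:
    # Blocks until the broker acknowledges the record (or the timeout expires).
    metadata = future.get(timeout=10)
    print(metadata.topic, metadata.partition, metadata.offset)
except KafkaError as err:
    print("delivery failed:", err)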
Ejemplo n.º 57
0
class CheckKafka(PubSubNagiosPlugin):
    def __init__(self):
        # Python 2.x
        super(CheckKafka, self).__init__()
        # Python 3.x
        # super().__init__()
        self.name = 'Kafka'
        self.default_port = 9092
        self.producer = None
        self.consumer = None
        self.topic = None
        self.client_id = 'Hari Sekhon ' + os.path.basename(
            get_topfile()) + ' ' + __version__
        self.group_id = self.client_id + ' ' + str(
            os.getpid()) + ' ' + random_alnum(10)
        self.acks = '1'
        self.retries = 0
        self.partition = None
        self.topic_partition = None
        self.brokers = None
        self.timeout_ms = None
        self.start_offset = None

    def add_options(self):
        # super(CheckKafka, self).add_options()
        # TODO: (host_envs, default_host) = getenvs2('HOST', default_host, name)
        # TODO: env support for Kafka brokers
        self.add_opt(
            '-B',
            '--brokers',
            metavar='broker_list',
            default='localhost:9092',
            help=
            'Kafka Broker seed list in form host[:port],host2[:port2]... (default: localhost:9092)'
        )
        self.add_opt('-T', '--topic', help='Kafka Topic')
        self.add_opt('-p',
                     '--partition',
                     type=int,
                     help='Kafka Partition (default: 0)',
                     default=0)
        self.add_opt(
            '-a',
            '--acks',
            default=1,
            choices=[1, 'all'],
            help=
            'Acks to require from Kafka. Valid options are \'1\' for Kafka ' +
            'partition leader, or \'all\' for all In-Sync Replicas (may block causing '
            + 'timeout if replicas aren\'t available, default: 1)')
        self.add_opt(
            '-s',
            '--sleep',
            metavar='secs',
            help=
            'Sleep in seconds between producing and consuming from given topic (default: 0.5)'
        )
        self.add_opt('--list-topics',
                     action='store_true',
                     help='List Kafka topics from broker(s) and exit')
        self.add_opt('--list-partitions',
                     action='store_true',
                     help='List Kafka topic partitions from broker(s) and exit')
        self.add_thresholds(default_warning=1, default_critical=2)

    def run(self):
        try:
            super(CheckKafka, self).run()
        except KafkaError as _:
            raise CriticalError(_)

    def get_topics(self):
        self.consumer = KafkaConsumer(bootstrap_servers=self.brokers,
                                      client_id=self.client_id,
                                      request_timeout_ms=self.timeout_ms)
        return self.consumer.topics()

    def print_topics(self):
        print('Kafka Topics:\n')
        for topic in self.get_topics():
            print(topic)

    def get_topic_partitions(self, topic):
        self.consumer = KafkaConsumer(topic,
                                      bootstrap_servers=self.brokers,
                                      client_id=self.client_id,
                                      request_timeout_ms=self.timeout_ms)
        if topic not in self.get_topics():
            raise CriticalError(
                "topic '{0}' does not exist on Kafka broker".format(topic))
        partitions = self.consumer.partitions_for_topic(topic)
        assert isSet(partitions)
        return partitions

    def print_topic_partitions(self, topic):
        print('Kafka topic \'{0}\' partitions:\n'.format(topic))
        #for partition in self.get_topic_partitions(topic):
        #    print(partition)
        print(list(self.get_topic_partitions(topic)))
        print()

    def process_args(self):
        self.brokers = self.get_opt('brokers')
        # TODO: add broker list validation back in
        # validate_hostport(self.brokers)
        log_option('brokers', self.brokers)
        self.timeout_ms = max(self.timeout * 1000 - 1000, 1000)

        list_topics = self.get_opt('list_topics')
        list_partitions = self.get_opt('list_partitions')
        if list_topics:
            self.print_topics()
            sys.exit(ERRORS['UNKNOWN'])
        self.topic = self.get_opt('topic')

        if self.topic:
            validate_chars(self.topic, 'topic', 'A-Za-z-')
        elif list_topics or list_partitions:
            pass
        else:
            self.usage('--topic not specified')

        if list_partitions:
            if self.topic:
                self.print_topic_partitions(self.topic)
            else:
                for topic in self.get_topics():
                    self.print_topic_partitions(topic)
            sys.exit(ERRORS['UNKNOWN'])

        self.partition = self.get_opt('partition')
        # technically optional, will hash to a random partition, but need to know which partition to get offset
        # if self.partition is not None:
        validate_int(self.partition, "partition", 0, 10000)
        self.topic_partition = TopicPartition(self.topic, self.partition)
        self.acks = self.get_opt('acks')
        log_option('acks', self.acks)
        self.validate_thresholds()

    def subscribe(self):
        self.consumer = KafkaConsumer(
            #self.topic,
            bootstrap_servers=self.brokers,
            # client_id=self.client_id,
            # group_id=self.group_id,
            request_timeout_ms=self.timeout_ms)
        #key_serializer
        #value_serializer
        log.debug('partition assignments: {0}'.format(
            self.consumer.assignment()))

        # log.debug('subscribing to topic \'{0}\' parition \'{1}\''.format(self.topic, self.partition))
        # self.consumer.subscribe(TopicPartition(self.topic, self.partition))
        # log.debug('partition assignments: {0}'.format(self.consumer.assignment()))

        log.debug('assigning partition {0} to consumer'.format(self.partition))
        # self.consumer.assign([self.partition])
        self.consumer.assign([self.topic_partition])
        log.debug('partition assignments: {0}'.format(
            self.consumer.assignment()))

        self.start_offset = self.consumer.position(self.topic_partition)
        # self.start_offset = 0
        log.debug('recorded starting offset \'{0}\''.format(self.start_offset))
        # self.consumer.pause()

    def publish(self):
        log.debug('creating producer')
        self.producer = KafkaProducer(bootstrap_servers=self.brokers,
                                      client_id=self.client_id,
                                      acks=self.acks,
                                      batch_size=0,
                                      max_block_ms=self.timeout_ms,
                                      request_timeout_ms=self.timeout_ms)
        #key_serializer
        #value_serializer
        log.debug('producer.send()')
        self.producer.send(self.topic,
                           key=self.key,
                           partition=self.partition,
                           value=self.publish_message)
        log.debug('producer.flush()')
        self.producer.flush()

    def consume(self):
        self.consumer.assign([self.topic_partition])
        log.debug('consumer.seek({0})'.format(self.start_offset))
        self.consumer.seek(self.topic_partition, self.start_offset)
        # self.consumer.resume()
        log.debug('consumer.poll(timeout_ms={0})'.format(self.timeout_ms))
        obj = self.consumer.poll(timeout_ms=self.timeout_ms)
        log.debug('msg object returned: %s', obj)
        msg = None
        try:
            for consumer_record in obj[self.topic_partition]:
                if consumer_record.key == self.key:
                    msg = consumer_record.value
                    break
        except KeyError:
            raise UnknownError('TopicPartition key was not found in response')
        if msg is None:
            raise UnknownError(
                "failed to find matching consumer record with key '{0}'".
                format(self.key))
        return msg
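Stripped of the plugin scaffolding, the check boils down to: record the consumer position, produce a uniquely keyed record, seek back, and poll until that key reappears. A standalone sketch of the same round trip with kafka-python (broker, topic, and key are placeholders):

from kafka import KafkaConsumer, KafkaProducer, TopicPartition

brokers, topic, key = 'localhost:9092', 'example-topic', b'probe-key'
tp = TopicPartition(topic, 0)

# Record the current position of the partition before producing.
consumer = KafkaConsumer(bootstrap_servers=brokers)
consumer.assign([tp])
start_offset = consumer.position(tp)

# Produce a single keyed record and wait for it to be flushed.
producer = KafkaProducer(bootstrap_servers=brokers, acks=1, batch_size=0)
producer.send(topic, key=key, partition=0, value=b'probe-value')
producer.flush()

# Seek back and poll until the keyed record shows up again.
consumer.seek(tp, start_offset)
records = consumer.poll(timeout_ms=5000)
for record in records.get(tp, []):
    if record.key == key:
        print('round trip ok, offset', record.offset)
        break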
Ejemplo n.º 58
0
def start(kafka_server_address, workflow_schema_path, queue_path, service_hub_path, library_path):
  #Logging
  logFactory = LoggerFactory(get_level(), get_logfile_path())
  log = logFactory.get_logger()
  log.setLevel(logging.INFO)
  logfile = logging.FileHandler("backendlog")
  log.addHandler(logfile)
  print("Workflow consumer started... " + "using workflow schema located at: " + workflow_schema_path)
  #Kafka
  event = Config.get_event("workflow_execute-workflow")
  consumer = KafkaConsumer(event, bootstrap_servers=kafka_server_address)
  #Avro
  schema = avro.schema.parse(open(workflow_schema_path).read())
  #Infinite loop that consumes workflow execution events
  for msg in consumer:
    print("Received workflow execution message")
    try:
        print("Parsing received workflow")
        bytes_reader = io.BytesIO(msg.value)
        decoder = avro.io.BinaryDecoder(bytes_reader)
        reader = avro.io.DatumReader(schema)
        workflow = reader.read(decoder)
        
        filename = workflow["user"] + "_" + hashlib.md5(msg.value).hexdigest() + "_" + str(time.time()).replace(".", "_") + ".py"
        path = os.path.join(queue_path, filename)

        print("Creating mysql entry")

        mysql_info = get_mysql_info()
        try:
            cnx = mysql.connector.connect(**mysql_info)
            x = cnx.cursor()
            table = get_table("queue")
            ts = time.time()
            timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            #TODO: Set priority dynamically somehow
            query = "INSERT INTO {} VALUES (NULL, '{}', '{}', '{}', '{}', '{}', '{}', '{}', {}, '{}', '{}');".format(
                table,
                workflow["user"],
                filename,
                json.dumps(workflow),
                "0",
                "0",
                "still unknown #todo",
                timestamp,
                "DEFAULT(finished_date)",
                "1",
                ""
            )
            try:
                x.execute(query)
                queue_index = x.lastrowid
                cnx.commit()
                print(">>> Successfully created mysql entry")

               
            except Exception as e:
                cnx.rollback()
                print(">>> Error when inserting row" + str(e))
            
        except mysql.connector.Error as err:
            print(">>> MYSQL error ({})".format(err))
        else:
            cnx.close()
        
        print(">>> Successfully parsed received workflow")
        print("Generating dask script")
        dsk_script_str = DaskScriptGenerator.generate_dask_script(workflow, service_hub_path, library_path, queue_index)
        print("Saving dask script")

        with open(path, "w") as text_file:
            text_file.write(dsk_script_str)
        
        print(">>> Successfully generated dask script to {}".format(path))

        producer = KafkaProducer(bootstrap_servers=kafka_server_address)
        topic = Config.get_event("workflow_workflow-processed")
        producer.send(topic, str(queue_index))

        producer.flush()

    except Exception as e:
        print(str(e))
    
    print("-------- Finished --------")
Ejemplo n.º 59
0
def main(argv):
    # Explicitly refer to the global variables to avoid creating a local variable that shadows the outer scope.
    global args

    # STEP 1: Setup the logging so we can start to log debug info right away.
    setupLogging()
    logger.debug(
        "\n\n\n--------------------[ JTI-GRPC-CLIENT: BEGIN EXECUTION ]--------------------"
    )

    # STEP 2: Parse the argument list with which this script was invoked and set global variable 'args'.
    logger.debug("Number of arguments used in script invocation: " +
                 str(len(argv)) + "\n")
    logger.info("Full argument list: " + str(argv) + "\n")
    #args = parseArguments()

    # STEP 3: Create an insecure channel to the gRPC server running on the router, and use the channel to create
    # the client stub
    device_ip_and_port = DEVICE_IP + ":" + DEVICE_PORT
    logger.info("Creating insecure channel to " + device_ip_and_port + " ...")
    grpc_channel = grpc.insecure_channel(device_ip_and_port)
    stub = agent_pb2_grpc.OpenConfigTelemetryStub(grpc_channel)
    logger.info("... Done!\n")

    # STEP 4: Get the Telemetry Agent operational states (this is one of the methods exposed via agent.proto).
    # As per the .proto file, use 0xFFFFFFFF for all subscription identifiers including agent-level operational stats.
    if INVOKE_GET_OPERATIONAL_STATE_FLAG:
        logger.info("Invoking 'getTelemetryOperationalState()' ...")
        get_oper_state_request = agent_pb2.GetOperationalStateRequest(
            subscription_id=0xFFFFFFFF,
            verbosity=agent_pb2.VerbosityLevel.Value(VERBOSITY.BRIEF))
        get_oper_state_response = stub.getTelemetryOperationalState(
            get_oper_state_request)
        logger.info(str(get_oper_state_response))
        logger.info("... Done!\n")

    # STEP 5: Return the set of data encodings supported by the device for telemetry data.
    if INVOKE_GET_DATA_ENCODINGS_FLAG:
        logger.info("Invoking 'getDataEncodings()' ...")
        get_data_encoding_request = agent_pb2.DataEncodingRequest()
        get_data_encoding_reply = stub.getDataEncodings(
            get_data_encoding_request)
        logger.info(str(get_data_encoding_reply))
        logger.info("... Done!\n")

    # STEP 6: Get the list of current telemetry subscriptions from the target (this is one of the methods exposed via agent.proto).
    # As per the .proto file, use 0xFFFFFFFF for all subscription identifiers.
    if INVOKE_GET_SUBSCRIPTIONS_FLAG:
        logger.info("Invoking 'getTelemetrySubscriptions()' ...")
        get_subscriptions_request = agent_pb2.GetSubscriptionsRequest(
            subscription_id=0xFFFFFFFF)
        get_subscriptions_reply = stub.getTelemetrySubscriptions(
            get_subscriptions_request)
        logger.info(str(get_subscriptions_reply))
        logger.info("... Done!\n")

    # STEP 7: The telemetrySubscribe() method requires a SubscriptionRequest object as an input, which in turn requires
    # a SubscriptionInput object and a list of Path objects as input ... assemble these various objects.

    # Setup Collector ...
    collector = agent_pb2.Collector(address=COLLECTOR_ADDRESS,
                                    port=COLLECTOR_PORT)
    logger.debug("Value of 'collector': " + str(collector))

    # Use Collector to setup SubscriptionInput ...
    subscription_input = agent_pb2.SubscriptionInput(
        collector_list=[collector])
    logger.debug("Value of 'subscription_input':\n" + str(subscription_input))

    # Setup Path ...
    #path = agent_pb2.Path(path="/junos/system/linecard/interface/", sample_frequency=5000)
    #path = agent_pb2.Path(path="/interfaces/interface[name='ge-0/0/0']/", sample_frequency=5000)
    #path = agent_pb2.Path(path="/interfaces/interface[name='ge-0/0/0']/state/", sample_frequency=5000)
    #path = agent_pb2.Path(path="/junos/events", sample_frequency=0)
    #path = agent_pb2.Path(path="/junos/events/event[id=\'UI_COMMIT\']", sample_frequency=0)
    #path = agent_pb2.Path(path="/components/", sample_frequency=5000)

    ## Multiple Sensor Subscriptions ...
    path1 = agent_pb2.Path(
        path="/interfaces/interface[name='ge-0/0/0']/state/",
        sample_frequency=5000)
    path2 = agent_pb2.Path(path="/junos/events/event[id=\'UI_COMMIT\']",
                           sample_frequency=0)
    #path2 = agent_pb2.Path(path="/junos/events", sample_frequency=0)

    # Use Path(s) to setup path_list ...
    #path_list = [path]
    path_list = [path1, path2]
    logger.debug("Value of 'path_list':\n" + str(path_list))

    # Use SubscriptionInput and path_list to setup SubscriptionRequest ...
    subscription_request = agent_pb2.SubscriptionRequest(
        input=subscription_input, path_list=path_list)
    logger.info("Value of 'subscription_request':\n" +
                str(subscription_request))

    # Define Kafka Endpoint
    producer = None
    if SOUTHBOUND_KAFKA_FLAG:
        bootstrap_server = KAFKA_IP + ":" + KAFKA_PORT
        logger.info("Value of 'bootstrap_server':" + bootstrap_server)

        # Connect to Kafka as a Producer
        logger.info("Connecting to Kafka as a Producer ...")
        producer = KafkaProducer(bootstrap_servers=bootstrap_server)
        logger.info("... Done!\n")

    # Launch telemetry subscription request ...
    for message in stub.telemetrySubscribe(subscription_request):
        # Print each telemetry message to console.
        print(message)

        # Parse message and assemble contents in JSON format in preparation for Kafka push.
        data = {}
        data['system_id'] = message.system_id
        data['component_id'] = message.component_id
        data['sub_component_id'] = message.sub_component_id
        data['path'] = message.path
        data['sequence_number'] = message.sequence_number
        data['timestamp'] = message.timestamp

        # The telemetry data returned is a list of key-value pairs, where the value can be one of the following
        # possible values: double_value, int_value, uint_value, sint_value, bool_value, str_value, bytes_value.
        kv_pairs = []
        for kv in message.kv:
            key = kv.key
            value_type = kv.WhichOneof('value')

            if value_type == "double_value":
                kv_pairs.append({key: kv.double_value})
            if value_type == "int_value":
                kv_pairs.append({key: kv.int_value})
            if value_type == "uint_value":
                kv_pairs.append({key: kv.uint_value})
            if value_type == "sint_value":
                kv_pairs.append({key: kv.sint_value})
            if value_type == "bool_value":
                kv_pairs.append({key: kv.bool_value})
            if value_type == "str_value":
                kv_pairs.append({key: kv.str_value})
            if value_type == "bytes_value":
                kv_pairs.append({key: kv.bytes_value})

            data['kv_pairs'] = kv_pairs

        #data['key'] = 'value'
        # Encode the data in JSON and pretty-print it before firing it off to Kafka.
        json_data = json.dumps(data, indent=3)

        if SOUTHBOUND_KAFKA_FLAG:
            # Publish message to Kafka bus.
            # Route to an appropriate Kafka topic, based on the telemetry subscription path.
            logger.info("Pushing message to Kafka ...")
            if JTI_INTERFACES_SENSOR_SUBSTRING in message.path:
                #producer.send(KAFKA_TOPIC_JUNIPER, json_data)
                producer.send(KAFKA_TOPIC_JUNIPER_INTERFACES, json_data)
                #producer.send('juniper', json_data)
            elif JTI_SYSLOG_SENSOR_SUBSTRING in message.path:
                #producer.send(KAFKA_TOPIC_JUNIPER, json_data)
                producer.send(KAFKA_TOPIC_JUNIPER_SYSLOG, json_data)
                #producer.send("juniper", json_data)
            else:
                producer.send(KAFKA_TOPIC_JUNIPER, json_data)

            # Block until all async messages are sent.
            # Failing to do so may result in the Producer being killed before messages are actually delivered!
            # Note that send() operates asynchronously!
            producer.flush()
            logger.info("... Done!\n")
Ejemplo n.º 60
0
	def produce_topic(self, topic_name, receiver_id, value, kafka_id):
	    producer = KafkaProducer(retries=no_of_retry,
	                             bootstrap_servers=kafka_id,
	                             key_serializer=lambda m: json.dumps(m).encode('utf-8'),
	                             value_serializer=lambda m: json.dumps(m).encode('utf-8'))
	    producer.send(topic_name, key=receiver_id, value=value)
	    producer.flush()
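Since produce_topic builds and discards a new KafkaProducer (including its broker connection) on every call, a natural refinement is to create the producer once and reuse it. A sketch only, assuming the same no_of_retry setting and JSON serializers as above:

class TopicProducer(object):
	def __init__(self, kafka_id):
		# One long-lived producer per process instead of one per message.
		self.producer = KafkaProducer(
			retries=no_of_retry,
			bootstrap_servers=kafka_id,
			key_serializer=lambda m: json.dumps(m).encode('utf-8'),
			value_serializer=lambda m: json.dumps(m).encode('utf-8'))

	def produce_topic(self, topic_name, receiver_id, value):
		self.producer.send(topic_name, key=receiver_id, value=value)
		self.producer.flush()

	def close(self):
		self.producer.close()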