Example #1
class SimpleProducer(BaseStreamProducer):
    def __init__(self, location, enable_ssl, cert_path, topic, compression, **kwargs):
        self._location = location
        self._topic = topic
        self._compression = compression
        self._create(enable_ssl, cert_path, **kwargs)

    def _create(self, enable_ssl, cert_path, **kwargs):
        max_request_size = kwargs.pop('max_request_size', DEFAULT_MAX_REQUEST_SIZE)
        kwargs.update(_prepare_kafka_ssl_kwargs(cert_path) if enable_ssl else {})
        self._producer = KafkaProducer(bootstrap_servers=self._location,
                                       retries=5,
                                       compression_type=self._compression,
                                       max_request_size=max_request_size,
                                       **kwargs)

    def send(self, key, *messages):
        for msg in messages:
            self._producer.send(self._topic, value=msg)

    def flush(self):
        self._producer.flush()

    def close(self):
        self._producer.close()
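Example #1 references a _prepare_kafka_ssl_kwargs helper that is not shown. A minimal sketch of what such a helper might look like, assuming the client certificate, key, and CA files live under cert_path (the file names and this implementation are assumptions, not the original helper):

import os

def _prepare_kafka_ssl_kwargs(cert_path):
    # Hypothetical helper: map a certificate directory onto kafka-python's SSL kwargs.
    return {
        'security_protocol': 'SSL',
        'ssl_cafile': os.path.join(cert_path, 'ca.crt'),        # assumed file name
        'ssl_certfile': os.path.join(cert_path, 'client.crt'),  # assumed file name
        'ssl_keyfile': os.path.join(cert_path, 'client.key'),   # assumed file name
    }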
Example #2
def test_end_to_end(kafka_broker):
    connect_str = 'localhost:' + str(kafka_broker.port)
    producer = KafkaProducer(bootstrap_servers=connect_str,
                             max_block_ms=10000,
                             value_serializer=str.encode)
    consumer = KafkaConsumer(bootstrap_servers=connect_str,
                             group_id=None,
                             consumer_timeout_ms=10000,
                             auto_offset_reset='earliest',
                             value_deserializer=bytes.decode)

    topic = random_string(5)

    for i in range(1000):
        producer.send(topic, 'msg %d' % i)
    producer.flush()
    producer.close()

    consumer.subscribe([topic])
    msgs = set()
    for i in range(1000):
        try:
            msgs.add(next(consumer).value)
        except StopIteration:
            break

    assert msgs == set(['msg %d' % i for i in range(1000)])
Example #3
    def run():
        parser = get_args_parser()
        try:
            parse_result = parser.parse_args()

            topic_name = parse_result.topic
            num_records = parse_result.num_records
            record_size = parse_result.record_size
            producer_props = parse_result.producer_config

            props = {}
            for prop in producer_props:
                k, v = prop.split('=')
                try:
                    v = int(v)
                except ValueError:
                    pass
                props[k] = v

            producer = KafkaProducer(**props)
            record = bytes(bytearray(record_size))
            stats = Stats(num_records, 5000)

            for i in xrange(num_records):
                send_start_ms = get_time_millis()
                future = producer.send(topic=topic_name, value=record)
                future.add_callback(stats.next_completion(
                        send_start_ms, record_size, stats))

            producer.close()
            stats.print_total()
        except Exception as e:
            exc_info = sys.exc_info()
            traceback.print_exception(*exc_info)
            sys.exit(1)
Example #4
def sendSingleMsg2Kafka(msg):
    if not msg:
        return
    producer = KafkaProducer(bootstrap_servers='10.128.184.167:9092')
    producer.send('topic_lpr', msg.encode('utf8'))
    producer.flush()
    producer.close(timeout=5)
Example #5
def produce_to_bruce(schema, args, config):
    topic = config['kafka']['topic']

    if args.partition_count:
        partition_count = args.partition_count
    else:
        print('fetch partition info for topic ' + topic)
        producer = KafkaProducer(bootstrap_servers = config['kafka']['brokers'])
        partition_count = 1 + max(producer.partitions_for(topic))
        producer.close()

    socket = bruce.open_bruce_socket()

    # batching socket send
    buff = []

    def flush_buff():
        for msg in buff:
            socket.sendto(msg, '/var/run/bruce/bruce.socket')
        del buff[:]

    def f_produce(topic, partition, key, value):
        if len(buff) < 1000:
            buff.append(bruce.create_msg(partition, topic, bytes(key), bytes(value)))
        else:
            flush_buff()

    try:
        bootstrap(f_produce, partition_count, schema, args.database, args.table, config)
        flush_buff()
    except KeyboardInterrupt:
        sys.exit(1)
    finally:
        socket.close()
Example #6
    def send_message(self, message, topic_partition):

        self._logger.info("Sending message to: Topic: {0} Partition: {1}".format(self._topic, topic_partition))
        kafka_brokers = '{0}:{1}'.format(self._server, self._port)
        producer = KafkaProducer(bootstrap_servers=[kafka_brokers], api_version_auto_timeout_ms=3600000)
        future = producer.send(self._topic,message,partition=topic_partition)
        producer.flush()
        producer.close()
Example #7
    def run(self):
        producer = KafkaProducer(bootstrap_servers='localhost:9092')

        while not self.stop_event.is_set():
            producer.send('my-topic', b"test")
            producer.send('my-topic', b"\xc2Hola, mundo!")
            time.sleep(1)

        producer.close()
Example #8
    def run(self):
        producer = KafkaProducer(bootstrap_servers='localhost:9092')

        while not self.stop_event.is_set():
            print("Sending message from: " + str(threading.get_ident()))
            producer.send('my-topic', b"test")
            producer.send('my-topic', b"\xc2Hola, mundo!")
            time.sleep(0.2)

        producer.close()
Example #9
 def _push(self, payload):
     if super(KafkaService, self)._push(payload):
         LOGGER.info("Pushing payload to kafka: %s", str(payload))
         brokers = self.destination_config['brokers'].split(',')
         topic = self.destination_config['topic']
         kafka_producer = KafkaProducer(bootstrap_servers=brokers)
         for values in payload:
             kafka_producer.send(topic, str(values).encode('utf-8'))
         kafka_producer.flush(3)
         kafka_producer.close(3)
     else:
         LOGGER.warn("Payload is none, nothing to push.")
Example #10
def producer_():
    from kafka import KafkaProducer
    producer = KafkaProducer(bootstrap_servers='192.168.1.101:9092')
    msg_dict = {'msg': 'Hello World',
                'sleep_time': 10,
                'db_config': 'retry'
                }

    msg = json.dumps(msg_dict)

    # encode to bytes: no value_serializer is configured on this producer
    producer.send('test_rhj', msg.encode('utf-8'), partition=0)
    producer.close()
Example #11
def main(directory, topic, byline):
    #get a hdfs object
    myHdfs = hdfs.hdfs()
    myPath = myHdfs.walk(directory)
    
    # a global variable
    global producer 

    # Get a producer object
    producer = KafkaProducer(bootstrap_servers=["node4:6667"], compression_type='gzip', acks=1, retries=2)
    
    for myfile in myPath:
        #Skip directories
        if myfile["kind"] == "directory":
            logger.debug("ignoring %s" %(myfile))
            continue

        elif myfile["kind"] == "file":
            pass

        else:
            raise Exception("Unknown kind %s for %s" % (myfile["kind"], myfile["name"]))
            
        #Skip particular names
        if "_SUCCESS" in myfile["name"] or "_temporary" in myfile["name"]:
            logger.debug("ignoring %s" %(myfile))
            continue
        
        #Skip 0 size files
        if myfile["size"] == 0:
            logger.debug("ignoring %s" %(myfile))
            continue
        
        logger.info("Working on %s" %(myfile["name"]))

        #call processChunk if I want to submit chunk
        if byline is False:
            processChunk(myfile, topic)
            
        else:
            #Otherwise submit line by line
            processLine(myfile, topic)

        #with file open
        logger.info("Completed %s" %(myfile["name"]))
        
        #sleep some time
        time.sleep(1)
                    
    # for all files in HDFS
    producer.close()
Example #12
def produce_to_kafka(schema, args, config):
    topic = config['kafka']['topic']
    producer = KafkaProducer(bootstrap_servers = config['kafka']['brokers'])

    def f_produce(topic, partition, key, value):
        producer.send(topic, key = key, value = value, partition = partition)

    partition_count = 1 + max(producer.partitions_for(topic))
    try:
        bootstrap(f_produce, partition_count, schema, args.database, args.table, config)
    except KeyboardInterrupt:
        sys.exit(1)
    producer.flush()
    producer.close()
Example #13
class KafkaPublisher(AbstractPublisher):
    """
    Uses Kafka as the message broker.
    """

    # noinspection PyAttributeOutsideInit
    def custom_init(self):
        self._producer = KafkaProducer(
            bootstrap_servers=frame_config.KAFKA_BOOTSTRAP_SERVERS)
        self._admin_client = KafkaAdminClient(
            bootstrap_servers=frame_config.KAFKA_BOOTSTRAP_SERVERS)
        try:
            self._admin_client.create_topics(
                [NewTopic(self._queue_name, 10, 1)])
            # admin_client.create_partitions({self._queue_name: NewPartitions(total_count=16)})
        except TopicAlreadyExistsError:
            pass
        except Exception as e:
            self.logger.exception(e)
        atexit.register(self.close)  # if the producer is not closed before the program exits, an error is raised.

    def concrete_realization_of_publish(self, msg):
        # noinspection PyTypeChecker
        # self.logger.debug(msg)
        # print(msg)
        self._producer.send(
            self._queue_name,
            msg.encode(),
        )

    def clear(self):
        self.logger.warning('clearing kafka messages is not implemented yet')
        # self._consumer.seek_to_end()
        # self.logger.warning(f'reset the kafka offset to the end position')

    def get_message_count(self):
        # return -1  # no way found yet to get the unconsumed message count across all partitions.
        # print(self._admin_client.list_consumer_group_offsets('frame_group'))
        # print(self._admin_client.describe_consumer_groups('frame_group'))
        return -1

    def close(self):
        self._producer.close()

    def _at_exit(self):
        self._producer.flush()
        super()._at_exit()
Example #14
class KafkaTweetSerializer:

    _producer = None

    def __init__(self, host='localhost', port='9092'):
        kafka_server = "{0}:{1}".format(host, str(port))
        self._producer = KafkaProducer(bootstrap_servers=[kafka_server],
                                       value_serializer=lambda v: json.dumps(v).encode('utf-8'))

    def write(self, message):
        self._producer.send(topic='tweets', value=message)
        self._producer.flush()
        print "Tweet!"

    def end(self):
        self._producer.close()
Example #15
class SessionProducer(threading.Thread):
    def __init__(self):
        threading.Thread.__init__(self)
        self.producer = KafkaProducer(bootstrap_servers=['localhost:9092'])


    def close(self):
        self.producer.close()

    def run(self):
        for session_file in glob.glob('./entree/session/session.*'):
            session = Utilities.getSessionData(session_file)
            print(session)
            msg = json.dumps(session).encode('utf-8')
            self.producer.send('session_data', msg)
            time.sleep(1)
Example #16
class Producer:
    def __init__(self, bootstrap_servers):
        self.producer = KafkaProducer(bootstrap_servers=bootstrap_servers)

    def send(self, topic, key=None, value=None):
        try:
            key = key.encode() if key else None
            value = value.encode() if value else None
            self.producer.send(topic, key=key, value=value)
        except Exception:
            raise
        finally:
            self.producer.flush()

    def close(self):
        self.producer.close()
Example #17
    def run(self, topic_name):

        producer = KafkaProducer(bootstrap_servers=self.bootstrap_server,
                                 acks=self.ack,
                                 retries=self.retries,
                                 linger_ms=self.linger_ms,
                                 key_serializer=str.encode,
                                 value_serializer=str.encode)

        for key in range(10):
            # Since a key is specified, all messages with the same key are routed to the same partition.
            producer.send(topic_name,
                          key=str(key),
                          value="message sent %s" % key)

        producer.close()
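The comment above relies on kafka-python's default partitioner hashing the key. A small check, not part of the original example (topic name and broker address are assumptions), that blocks on the returned futures and compares the record metadata:

from kafka import KafkaProducer

# Messages sent with the same key should land on the same partition.
producer = KafkaProducer(bootstrap_servers='localhost:9092',
                         key_serializer=str.encode,
                         value_serializer=str.encode)
meta_a = producer.send('demo-topic', key='user-42', value='first').get(timeout=10)
meta_b = producer.send('demo-topic', key='user-42', value='second').get(timeout=10)
assert meta_a.partition == meta_b.partition
producer.close()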
Example #18
def print_lines(filepath, kafka_topic=None):
    pr = print
    producer = None

    if kafka_topic:
        from kafka import KafkaProducer
        producer = KafkaProducer(bootstrap_servers="kafka:9092")
        pr = lambda line: producer.send(kafka_topic, line.encode('utf-8'))

    try:
        for line in generate_lines(filepath):
            pr(line)
            time.sleep(0.2)
    finally:
        if producer:
            producer.close()
Example #19
    def run(self):
        #Bootstraps an instance of a Kafka producer.
        #Initializes the producer and identifies the docker server.
        #kafka-spotify is listed in /etc/hosts with the ip of the container
        producer = KafkaProducer(bootstrap_servers='kafka-spotify:9092')

        #loop until the thread is stopped by checking the stop event
        while not self.stop_event.is_set():
            #Send two messages of type binary to the 'test' Topic
            producer.send('test', b"test")
            producer.send('test', b"Hola, mundo!")
            #Sleep for 3 seconds
            time.sleep(3)

        #Close the TCP stream to Kafka
        producer.close()
Example #20
 def produce(self):
     producer = KafkaProducer(bootstrap_servers='10.110.87.202:9092')
     msg_dict = {
         "sleep_time": 10,
         "db_config": {
             "database": "test_1",
             "host": "xxxx",
             "user": "******",
             "password": "******"
         },
         "table": "msg",
         "msg": "Hello World"
     }
     msg = json.dumps(msg_dict)
     producer.send('test_rhj', msg.encode('utf-8'), partition=0)  # encode to bytes: no value_serializer is configured
     producer.close()
Example #21
def Producer():
    # producer = KafkaProducer(bootstrap_servers ='10.42.2.92:9092')
    producer = KafkaProducer(
        bootstrap_servers='10.42.2.92:9092',
        value_serializer=lambda v: json.dumps(v).encode('utf-8'))
    print('send to kafka start!')
    # msg = json.dumps(msg1)
    # print type(msg)
    # msg = json.dumps(mess)
    # print type(msg)
    # producer.send(topic, msg.encode())
    producer.send(topic, mess)
    # producer.send(topic, msg1)
    time.sleep(1)
    print('send to kafka finished!')
    producer.close()
Example #22
    def transfer_msg(self, msg):
        _vcenter_mq_url = "{}:{}".format(
            self._vcenter_cfg["message-queue"]["ipaddr"],
            self._vcenter_cfg["message-queue"]["port"])
        _vcenter_mq_topic = self._vcenter_cfg["message-queue"]["topic-map"][
            "flow.measure"]

        try:
            encoded_msg = json.dumps(msg).encode('utf-8')
            producer = KafkaProducer(bootstrap_servers=[_vcenter_mq_url])
            producer.send(_vcenter_mq_topic, encoded_msg)
            producer.close()

        except NoBrokersAvailable as noBrokerExt:
            self._logger.error(
                "Kafka Broker in Security Post is not accessible")
Example #23
 def send_json_kafka(self):
     producer = KafkaProducer(
         value_serializer=lambda v: json.dumps(v).encode('utf-8'),
         bootstrap_servers=self.bstrap_servers)
     for i in range(10000):
         time.sleep(1)
         data = {
             "id": i,
             "name": "zhangsan" + str(i),
             "sex": "m",
             "age": 30,
             "dates": timestamp_now()
         }
         log.info("往kafka输入的data:%s", data)
         producer.send(self.json_topic, data)
     producer.close()
Example #24
def send_goal_event(timestamp: int, topic: str):
    # Hacky
    # TODO: Fix this
    # Create producer for the kafka topic do get ready to publish
    kafka_producer = KafkaProducer(bootstrap_servers=KAFKA_SERVER,
                                   api_version=(2, 5, 0))

    payload = dict(timestamp=timestamp)

    # JSON to bytearray
    payload_bytes = json.dumps(payload).encode('utf-8')

    print('Sending payload:', payload, payload_bytes)
    kafka_producer.send(topic, payload_bytes)

    kafka_producer.close()
Example #25
def shutdown_hook(producer: KafkaProducer):
    """
    a shutdown hook to be called before the shutdown
    """
    try:
        logger.info("Flushing pending message to kafka, timeout is set to 10s.")
        producer.flush(10)
        logger.info("Flushed flushing pending message to kafka")
    except KafkaError as kafka_error:
        logger.warning(f"Flushed to flush pending message to kafka, caused by: {kafka_error}")
    finally:
        try:
            logger.info("Closing kafka connection.")
            producer.close()
        except Exception as e:
            logger.warning(f"Failed to close kafka connection, caused by {e}")
Example #26
def produce(brokers: list, topic: str):

    producer = KafkaProducer(bootstrap_servers=brokers,
                            key_serializer=string_to_bytes,
                            value_serializer=json_serialization)

    i = 0
    while i<1000:
        sleep(1)
        key = "message_key_" + str(i)
        value = {"id": i, "date": 100000 *(2+i**3), "info": "sensor_" + str(i)}
        print("Message Produced: key = {} value = {}".format(key, value))
        producer.send(topic=topic, key=key, value=value)
        i += 1
    producer.flush()
    producer.close()
Example #27
class Kafka:
    def __init__(self, hosts, topicName):
        self.hosts = hosts
        self.topic = topicName
        self.kafkaProducer = KafkaProducer(bootstrap_servers=self.hosts)

    def producer(self, value, key=None, sleepTime=1):
        if key is None:
            key = str(datetime.now()).encode()
        self.kafkaProducer.send(self.topic, value, key)
        print("%s:插入kafka成功" % datetime.now())
        time.sleep(sleepTime)

    # close the producer
    def producerClose(self):
        self.kafkaProducer.close()
Example #28
class KafkaProduceServer(object):
    def __init__(self, topic, server):
        if type(server) != list:
            server = [server]
        self._topic = topic
        self._producer = KafkaProducer(bootstrap_servers=server,
                                       value_serializer=lambda m: json.dumps(m).encode('ascii'))

    def getProducer(self):
        return self._producer

    def sendMsg(self, msg):
        self._producer.send(self._topic, msg)
        self._producer.flush()

    def sendJson(self, key, json):
        self._producer.send(self._topic, key=key, value=json)
        self._producer.flush()

    def close(self):
        self._producer.close()
Example #29
class KafkaHandler(logging.Handler):
    """Class to instantiate the kafka logging facility."""
    def __init__(self, hostlist, topic='logs', tls=None):
        """Initialize an instance of the kafka handler."""
        logging.Handler.__init__(self)
        self.producer = KafkaProducer(
            bootstrap_servers=hostlist,
            value_serializer=lambda v: json.dumps(v).encode('utf-8'),
            linger_ms=10)
        self.topic = topic
        self.record = None

    def emit(self, record):
        """Emit the provided record to the kafka_client producer."""
        # drop kafka logging to avoid infinite recursion
        if 'kafka.' in record.name:
            return

        try:
            # apply the logger formatter
            msg = self.format(record)

            self.producer.send(
                self.topic, {
                    't': int(time.time()),
                    'source': record.name,
                    'level': record.levelname,
                    'message': msg
                })
            self.flush(timeout=1.0)
        except Exception:
            logging.Handler.handleError(self, record)

    def flush(self, timeout=None):
        """Flush the objects."""
        self.producer.flush(timeout=timeout)

    def close(self):
        """Close the producer and clean up."""
        self.acquire()
        try:
            if self.producer:
                self.producer.close()

            logging.Handler.close(self)
        finally:
            self.release()
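A handler like this is attached to a standard logging logger. A minimal usage sketch, assuming the KafkaHandler class above and a reachable broker (address and logger name are illustrative):

import logging

# Route application log records to the 'logs' topic via the handler above.
logger = logging.getLogger("my_app")
logger.setLevel(logging.INFO)
logger.addHandler(KafkaHandler(["localhost:9092"], topic="logs"))
logger.info("application started")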
Example #30
    class KafkaP:
        """
        生产模块:根据不同的key,区分消息
        """
        def __init__(self, bootstrap_servers, compression_type='gzip'):
            self.bootstrap_servers = bootstrap_servers
            self.retries = 3
            self.ack = 0
            self.linger_ms = 0
            self.compression_type = compression_type
            if self.compression_type is None:
                self.producer = KafkaProducer(
                    bootstrap_servers=self.bootstrap_servers,
                    retries=self.retries,
                    acks=self.ack,
                    linger_ms=self.linger_ms,
                )
            else:
                self.producer = KafkaProducer(
                    bootstrap_servers=self.bootstrap_servers,
                    retries=self.retries,
                    acks=self.ack,
                    linger_ms=self.linger_ms,
                    compression_type=self.compression_type)

        def send_data(self, message, topic, key=None):
            self.producer.send(topic=topic, key=key, value=message)
            # print message

        def reconnection_producer(self):
            if self.compression_type is None:
                self.producer = KafkaProducer(
                    bootstrap_servers=self.bootstrap_servers,
                    retries=self.retries,
                    acks=self.ack,
                    linger_ms=self.linger_ms)
            else:
                self.producer = KafkaProducer(
                    bootstrap_servers=self.bootstrap_servers,
                    retries=self.retries,
                    acks=self.ack,
                    linger_ms=self.linger_ms,
                    compression_type=self.compression_type)

        def close_producer(self):
            self.producer.flush()
            self.producer.close()
Example #31
def process_SEC_rss(item):
    index_rss = 'http://www.sec.gov/Archives/edgar/monthly/xbrlrss-{}.xml'.format(
        item)
    producer = KafkaProducer(bootstrap_servers=kafka_url)
    rss_feed = urllib2.urlopen(index_rss)
    index_data = rss_feed.read()
    rss_feed.close()

    index_doc = xmltodict.parse(index_data)
    item_list = index_doc['rss']['channel']['item']
    msg_count = 0
    for entry in item_list:
        formType = entry['edgar:xbrlFiling']['edgar:formType']
        filingInfo = entry['edgar:xbrlFiling']

        if (formType == '10-Q' or formType == '10-K'):
            newRow = {
                'companyName': get_value(filingInfo, 'edgar:companyName'),
                'guid': get_value(entry, 'guid'),
                'xml_filing': index_rss,
                'pubDate': get_value(entry, 'pubDate'),
                'formType': formType,
                'filingDate': get_value(filingInfo, 'edgar:filingDate'),
                'cikNumber': get_value(filingInfo, 'edgar:cikNumber'),
                'accessionNumber': get_value(filingInfo,
                                             'edgar:accessionNumber'),
                'fileNumber': get_value(filingInfo, 'edgar:fileNumber'),
                'filingInfo': get_value(filingInfo, 'edgar:period'),
                'fiscalYearEnd': get_value(filingInfo, 'edgar:fiscalYearEnd'),
            }
            #           cols = newRow.keys()
            #           vals = [newRow[x] for x in cols]
            #            vals_str_list = ["%s"] * len(vals)
            #            vals_str = ", ".join(vals_str_list)
            try:
                jsec = json.dumps(newRow)
                producer.send(topic_name, jsec)
                producer.flush()

                msg_count = msg_count + 1

                print("Added {} sec filings".format(msg_count))
            except Exception as e:
                print("Exception encountered: %s" % e)
    metrics = producer.metrics()
    print(metrics)
    producer.close()
Example #32
class KafkaEventHandler(EventHandlerInterface):
    """This class implements an event record listener, that will forward Json-objects to a Kafka queue."""
    def __init__(self, analysis_context, topic, options):
        self.analysis_context = analysis_context
        self.options = options
        self.topic = topic
        self.producer = None
        self.kafka_imported = False

    def receive_event(self, _event_type, _event_message, _sorted_log_lines,
                      event_data, _log_atom, event_source):
        """Receive information about a detected event in json format."""
        if hasattr(event_source, 'output_event_handlers') and event_source.output_event_handlers is not None and self not in \
                event_source.output_event_handlers:
            return True
        component_name = self.analysis_context.get_name_by_component(
            event_source)
        if component_name in self.analysis_context.suppress_detector_list:
            return True
        if self.kafka_imported is False:
            try:
                from kafka import KafkaProducer
                from kafka.errors import KafkaError
                self.producer = KafkaProducer(
                    **self.options, value_serializer=lambda v: v.encode())
                self.kafka_imported = True
            except ImportError:
                msg = 'Kafka module not found.'
                logging.getLogger(AminerConfig.DEBUG_LOG_NAME).error(msg)
                print('ERROR: ' + msg, file=sys.stderr)
                return False
        if not isinstance(event_data, str) and not isinstance(
                event_data, bytes):
            msg = 'KafkaEventHandler received non-string event data. Use the JsonConverterHandler to serialize it first.'
            logging.getLogger(AminerConfig.DEBUG_LOG_NAME).warning(msg)
            print('WARNING: ' + msg, file=sys.stderr)
            return False
        try:
            self.producer.send(self.topic, event_data)
        except KafkaError as err:
            msg = str(err)
            logging.getLogger(AminerConfig.DEBUG_LOG_NAME).error(msg)
            print("Error: " + msg, file=sys.stderr)
            self.producer.close()
            self.producer = None
            return False
        return True
Example #33
def get_stats(UDP_IP, UDP_PORT):

    # The following strings are the linux commands
    cpu_cmd = 'grep cpu /proc/stat | awk \'{usage=($2+$4)*100/($2+$4+$5)} END {print usage "%"}\''
    mem_cmd = 'free -tmh | grep Mem | tr -s \' \' | awk \'{print $3","$7","$2}\''
    network_cmd = 'cat /proc/net/dev | tr -s \' \' | grep eth0 | awk \'{print $2","$10}\''
    ps_cmd = 'ps'

    # Call cmd_output function to get the output of the executed commands
    cpu = cmd_output(cpu_cmd)
    mem = cmd_output(mem_cmd)
    network = cmd_output(network_cmd)
    ps = cmd_output(ps_cmd)

    parse_mem = mem.split(",")
    used_mem = parse_mem[0]
    available_mem = parse_mem[1]
    total_mem = parse_mem[2]

    parse_network = network.split(",")
    rcv_bytes = parse_network[0]
    trans_bytes = parse_network[1]
    # build json object
    data = {}
    data['Date'] = datetime.datetime.now().strftime("%y-%m-%d %H:%M:%S")
    data['CPU'] = cpu
    data['Memory_used'] = used_mem
    data['Memory_available'] = available_mem
    data['Network_received_bytes'] = rcv_bytes
    data['Network_transmitted_bytes'] = trans_bytes
    data['ps_command_output'] = str(ps)

    print(str(data))

    json_data = json.dumps(data)

    # send data to Kafka consumer
    try:
        kafka_server = UDP_IP + ":" + str(UDP_PORT)
        producer = KafkaProducer(bootstrap_servers=kafka_server)
        producer.send('stats', json_data.encode('utf8'))
        producer.flush()
        producer.close()
    except Exception as e:
        print("Error occurred during data transmission process!")
        print(str(e))
        pass
Example #34
def push_to_pandas(df):
    import pygeohash
    from cassandra.cluster import Cluster
    from kafka import KafkaProducer
    import timeit

    cluster = Cluster()
    session = cluster.connect('xweather')

    producer = KafkaProducer(bootstrap_servers=['vm1:9092'])
    name = multiprocessing.current_process().name
    #df = pd.read_csv(filename)
    df1 = df[['id','lat','lon','src','elev','timezone','tzoffset']].drop_duplicates()
    df1.src.fillna('NA')
    # Adding Geohash Id
    df1['geohash_id'] = df.apply(lambda row: pygeohash.encode(row['lat'], row['lon']), axis=1)

    #Now loop through the Dataframe
    for row in df1.itertuples():
        j = ','.join((row[8], str(row[1]), str(row[5]), row[8][:3], str(row[2]), str(row[3]), str(row[4]), str(row[6]), str(row[7])))
        future = producer.send('topic-weather-stations', j)

    print('Completed insert into weather stations', name)

    #Now to the facts
    #Remove the descriptive columns
    df.drop(df.columns[[1,2,3,4,5,6]], axis=1, inplace=True)

    #Unpivot the dataset
    df = pd.melt(df, id_vars=['id','timestamp','dateTime'])
    df = df.dropna()
    # Kafka it
    ctr = 0
    producer = KafkaProducer(bootstrap_servers=['vm1:9092'], batch_size=20000, linger_ms=50, buffer_memory=952108864)
    #producer = KafkaProducer(bootstrap_servers=['vm1:9092'])
    start_time = timeit.default_timer()
    for row in df.itertuples():
        k = list(row)
        k = k[1:]
        j = ','.join(str(x) for x in k)
        future = producer.send('topic-weather-data', j)
        ctr += 1
    print('Producer timing is ', name, timeit.default_timer() - start_time, 'Rows:', ctr)
    producer.flush()
    producer.close()
Example #35
class KafkaProducerPipeline(object):
    def __init__(self, kafka_bootstrap_server):
        self.kafka_bootstrap_server = []
        self.kafka_bootstrap_server.append(kafka_bootstrap_server)
        self.collection_name = 'articles'
        self.encoder = ScrapyJSONEncoder()
        self.index = 0  # counter incremented in process_item below

    @classmethod
    def from_crawler(cls, crawler):
        # pull in information from settings.py
        return cls(kafka_bootstrap_server=crawler.settings.get(
            'KAFKA_BOOTSTRAP_SERVER'), )

    def open_spider(self, spider):
        print("spider name: ", spider.name)
        # initializing py-Kafka producer
        self.producer = KafkaProducer(
            bootstrap_servers=self.kafka_bootstrap_server)

        print("kafka_bootstrap_server: ", self.kafka_bootstrap_server)
        if hasattr(spider, 'collection_name'):
            print("spider collection_name: ", spider.collection_name)
            self.collection_name = spider.collection_name

    def close_spider(self, spider):
        # clean up when spider is closed
        self.producer.flush(timeout=60)
        self.producer.close(timeout=60)

    def process_item(self, item, spider):
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            print("valid - inside process_item...", item['source'], ': ',
                  item['headline'])
            # self.producer.send('articles', self.encoder.encode(item).encode())
            key = str(ord(item['source'][0])) + str(ord(item['source'][1]))
            self.producer.send('articles',
                               value=self.encoder.encode(item).encode(),
                               key=key.encode())
            self.index += 1
            logging.debug("News item sent by Kafka Producer!")
        return item
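The pipeline above is enabled through Scrapy's settings. A minimal sketch of the relevant settings.py entries, with an assumed module path and broker address (not taken from the original project):

# settings.py (hypothetical project layout)
KAFKA_BOOTSTRAP_SERVER = "localhost:9092"

ITEM_PIPELINES = {
    # the priority value 300 is an arbitrary example
    "myproject.pipelines.KafkaProducerPipeline": 300,
}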
Example #36
def send_kafka(msg):
    try:
        producer = KafkaProducer(bootstrap_servers=bootstrap_servers,
                                 retries=retries)
    except Exception as e:
        logger.error(e)
        raise Exception("catch an exception when create KafkaProducer")
    try:
        producer.send(topic, msg)
        producer.flush()
        producer.close()

    except Exception as e:
        logger.error(e)
        if producer:
            producer.close()
        raise Exception("catch an exception when send message:%s" % msg)
Example #37
def chat_input():
    producer = KafkaProducer(bootstrap_servers=KAFKA_IP)
    try:
        chat_name = request.form['chat_name']
        chat_text = request.form['chat_text']
        chat_time = time.strftime('%d/%m/%Y %H:%M')
        if not chat_name:
            chat_name = 'Anonymous'
        msg_json = json.dumps({
            "chat_name": chat_name,
            "chat_text": chat_text,
            "chat_time": chat_time
        })
        producer.send(topic, msg_json.encode('utf-8'))
        return index()
    finally:
        producer.close()
Example #38
def send2Kafka(msgs):
    if not msgs:
        return
    producer = KafkaProducer(bootstrap_servers='10.128.184.167:9092')

    global count
    for msg in msgs:
        tmp = format_msg(msg)
        # print 'Send ==> ', tmp
        producer.send('topic_lpr', tmp.encode('utf8'))
        if count % 100 == 0:
            print(u'==>[{}] {}'.format(count, tmp))
            producer.flush()
        count += 1

    producer.flush()
    producer.close(timeout=5)
Example #39
 def run(self):
     producer = KafkaProducer(bootstrap_servers=['kafka:9092'])
     url_col = -1
     i = -1
     with open(self.filename, 'rU') as csvfile:
         reader = csv.reader(csvfile,delimiter=',')
         for row in reader:
             i += 1
             if i == 0:
                 try:
                     url_col = row.index("URL")
                 except ValueError:
                     print("ERROR: The csv must have a column header titled 'URL'")
                     return
                 continue
             producer.send('demo.incoming', '{"url": "'+row[url_col]+'", "appid":"testapp", "crawlid":"' + self.filename + "_" +str(i)+'", "spiderid":"parsing_link", "maxdepth": 2}')
     producer.close()
Example #40
def send_data(value):
    producer = KafkaProducer(bootstrap_servers=['testdb.ibuildingsh.com:9092'])

    data = {
        'source_id': '6012283db2296c000167711d',
        'key': 'document-data',
        'value': value
    }

    future = producer.send(
        topic='t-document-data',
        key='document-data'.encode('utf-8'),
        # key=str(datetime.now()).encode('utf-8'),
        value=json.dumps(data).encode('utf-8'),
        partition=0)

    producer.close()
Example #41
def test_data():
    producer = KafkaProducer(bootstrap_servers=['testdb.ibuildingsh.com:9092'])
    # , security_protocol="SSL"

    types = ['doc', 'pdf', 'ppt']
    filetypes = ['施工', '运维', '进度', '方案']
    usernames = ['我', '你', '你', '他']

    data = {
        'source_id': '6010c605186e4e0001e27ab1',
        'key': 'test-file',
        'value': []
    }

    for i in range(10):
        cursize = np.random.rand() * 100
        value = {
            'filetype': np.random.choice(filetypes),
            'filename': str(i + 300000),
            'type': np.random.choice(types),
            'sizeMb': int(cursize),
            'sizeByte': int(cursize * 1024 * 1024),
            'username': np.random.choice(usernames),
            'timestamp': int(
                datetime(year=2021, month=1,
                         day=np.random.randint(1, 19)).timestamp())
        }
        data['value'].append(value)

    future = producer.send(
        topic='t-file-data-multi',
        key='file-test-multi'.encode('utf-8'),
        # key=str(datetime.now()).encode('utf-8'),
        value=json.dumps(data).encode('utf-8'),
        partition=0)

    # result = future.get(timeout=10)
    # print(data)
    producer.close()
Example #42
class SenderKafka:
    def __init__(self, topic: str, bootstrap_servers):
        if isinstance(bootstrap_servers, str):
            bootstrap_servers = [bootstrap_servers]

        assert isinstance(bootstrap_servers, list)

        self._producer = KafkaProducer(bootstrap_servers=bootstrap_servers)
        self._closed = False
        self.topic = topic
        self.bootstrap_servers = bootstrap_servers

    def send(self, message, topic=None):
        if self._closed:
            log.error(
                'The sender is closed, create a new one to send a message')
            return None
        if topic is None:
            topic = self.topic
        if isinstance(message, dict):
            try:
                message = json.dumps(message)
            except ValueError:
                log.error(
                    'Could not convert {} to json string'.format(message))
                return None
        if isinstance(message, str):
            try:
                message = message.encode('utf-8')
            except ValueError:
                log.error('Could not convert {} to bytes'.format(message))
                return None
        if isinstance(message, bytes):
            log.debug('Sending {}'.format(message))
            result = self._producer.send(topic, message)
            self._producer.flush()
            return result
        else:
            log.error(
                'The message must be either bytes, utf-8 string or a dict')
            return None

    def close(self):
        if not self._closed:
            self._closed = True
            self._producer.close()
Example #43
def send_kafka_method1():
    """
    发送方式一
    发送并忘记(不关注是否正常到达,不对返回结果做处理)
    :return:
    """
    producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVERS)
    start_time = time.time()
    for i in range(0, 10000):
        msg = 'echo %s' % i
        # print(msg)
        future = producer.send(TOPIC, msg.encode(), partition=0)
    # push all buffered messages to the broker
    producer.flush()
    producer.close()
    time_cost = time.time() - start_time
    print('sending took %s seconds' % time_cost)
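The docstring above describes fire-and-forget sending. For contrast, a synchronous variant, not part of the original snippet, blocks on the returned future; a minimal sketch reusing the same TOPIC and BOOTSTRAP_SERVERS constants:

from kafka import KafkaProducer
from kafka.errors import KafkaError

def send_kafka_sync():
    # Synchronous send: block on the future to confirm delivery or surface the error.
    producer = KafkaProducer(bootstrap_servers=BOOTSTRAP_SERVERS)
    try:
        record_metadata = producer.send(TOPIC, b'echo sync', partition=0).get(timeout=10)
        print(record_metadata.topic, record_metadata.partition, record_metadata.offset)
    except KafkaError as e:
        print('send failed: %s' % e)
    finally:
        producer.close()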
Example #44
def send_message_group():
    producer = KafkaProducer(bootstrap_servers=common.KAFKA_BROKET_LIST)

    for i in range(100):
        key = tobytes(datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

        value = tobytes("hadoop" if i % 3 == 0 else "hive")

        print(str(key) + "," + str(value))

        producer.send("group_withwatermark", key=key, value=value)

        time.sleep(2)

    producer.flush()

    producer.close()
Example #45
def send2Kafka(msgs):
    if not msgs:
        return
    producer = KafkaProducer(bootstrap_servers='10.128.184.167:9092')

    global count
    for msg in msgs:
        tmp = format_msg(msg)
        #print 'Send ==> ', tmp
        producer.send('topic_taxi', tmp.encode('utf8'))
        if count%2000 == 0:
            print(u'==>[{}] {}'.format(count, tmp))
            producer.flush()
        count += 1

    producer.flush()
    producer.close(timeout=5)
Example #46
def try_send():

    producer = KafkaProducer(bootstrap_servers="ip-172-31-12-78.us-west-1.compute.internal:6667")
    # client = KafkaClient("ip-172-31-12-78.us-west-1.compute.internal:6667")
    # producer = SimpleProducer(client, async=True, batch_send_every_n = 100, batch_send_every_t = 60, random_start=False)
    # producer = SimpleProducer(client)
    # connect_str = 'ip-172-31-12-78.us-west-1.compute.internal:6667'
    # producer = KafkaProducer(bootstrap_servers=connect_str,
    #                         max_block_ms=10000,
    #                         value_serializer=str.encode)

    topic = '2008'
    with open('/home/ec2-user/data/2008.csv') as f:
        for line in f:
            producer.send(topic, line.encode('utf-8'))  # encode to bytes: no value_serializer is configured

    producer.flush()
    producer.close()
Example #47
    def setUpClass(cls):
        cls.filename = os.path.join(os.getcwd(), "python_temp.pickle")
        cls.kafka_host = "localhost:9092"
        millis = int(round(time.time() * 1000))
        cls.topic = "test_{}".format(millis)
        kafka_params = {
            "kafkaParam": {"topic": cls.topic, "bootstrap.servers": cls.kafka_host,
                           "group_id": "group-jj",
                           "debug": False},
            "systemParam": {},
            "internalSystemParam": {"stopFlagNum": 3},
            "fitParam": {"alg": "RandomForestClassifier", "debug": "True"}
        }
        pickle.dump(kafka_params, open(cls.filename, "wb"), 2)

        producer = KafkaProducer(bootstrap_servers=cls.kafka_host)
        for i in xrange(10):
            producer.send(cls.topic, pickle.dumps("{}", 2))
        producer.close()
Example #48
def test_end_to_end(kafka_broker, compression):

    if compression == 'lz4':
        # LZ4 requires 0.8.2
        if version() < (0, 8, 2):
            return
        # python-lz4 crashes on older versions of pypy
        elif platform.python_implementation() == 'PyPy':
            return

    connect_str = ':'.join([kafka_broker.host, str(kafka_broker.port)])
    producer = KafkaProducer(bootstrap_servers=connect_str,
                             retries=5,
                             max_block_ms=30000,
                             compression_type=compression,
                             value_serializer=str.encode)
    consumer = KafkaConsumer(bootstrap_servers=connect_str,
                             group_id=None,
                             consumer_timeout_ms=30000,
                             auto_offset_reset='earliest',
                             value_deserializer=bytes.decode)

    topic = random_string(5)

    messages = 100
    futures = []
    for i in range(messages):
        futures.append(producer.send(topic, 'msg %d' % i))
    ret = [f.get(timeout=30) for f in futures]
    assert len(ret) == messages
    producer.close()

    consumer.subscribe([topic])
    msgs = set()
    for i in range(messages):
        try:
            msgs.add(next(consumer).value)
        except StopIteration:
            break

    assert msgs == set(['msg %d' % i for i in range(messages)])
    consumer.close()
Example #49
def test_end_to_end(kafka_broker, compression):

    if compression == 'lz4':
        # LZ4 requires 0.8.2
        if version() < (0, 8, 2):
            return
        # LZ4 python libs dont work on python2.6
        elif sys.version_info < (2, 7):
            return

    connect_str = 'localhost:' + str(kafka_broker.port)
    producer = KafkaProducer(bootstrap_servers=connect_str,
                             retries=5,
                             max_block_ms=10000,
                             compression_type=compression,
                             value_serializer=str.encode)
    consumer = KafkaConsumer(bootstrap_servers=connect_str,
                             group_id=None,
                             consumer_timeout_ms=10000,
                             auto_offset_reset='earliest',
                             value_deserializer=bytes.decode)

    topic = random_string(5)

    messages = 100
    futures = []
    for i in range(messages):
        futures.append(producer.send(topic, 'msg %d' % i))
    ret = [f.get(timeout=30) for f in futures]
    assert len(ret) == messages

    producer.close()

    consumer.subscribe([topic])
    msgs = set()
    for i in range(messages):
        try:
            msgs.add(next(consumer).value)
        except StopIteration:
            break

    assert msgs == set(['msg %d' % i for i in range(messages)])
Example #50
    def process_data(self, msg):
        result = 'ok'
        _data = msg['filename'] + ': ' + msg['data']
        self.log.debug(msg['collectors'] + _data)

        producer = KafkaProducer(bootstrap_servers=self.kfk_server)

        future = producer.send(self.topic, _data)
        # Block for 'synchronous' sends
        try:
            record_metadata = future.get(timeout=10)
        except KafkaError:
            # Decide what to do if produce request failed...
            self.log.error(traceback.format_exc())
            result = 'Fail'
        finally:
            producer.close()

        # return record_metadata.topic, record_metadata.partition, record_metadata.offset
        return result,
Example #51
class SimpleProducer(BaseStreamProducer):
    def __init__(self, location, topic, compression):
        self._location = location
        self._topic = topic
        self._compression = compression
        self._create()

    def _create(self):
        self._producer = KafkaProducer(bootstrap_servers=self._location, retries=5,
                                       compression_type=self._compression)

    def send(self, key, *messages):
        for msg in messages:
            self._producer.send(self._topic, value=msg)

    def flush(self):
        self._producer.flush()

    def close(self):
        self._producer.close()
Example #52
def run(argv):
    old_client = False

    if len(argv) > 6:  # six positional arguments are read below
        test_duration = argv[1]
        msg_batch = argv[2]
        msg_requested_rate = argv[3]
        topic_name = argv[4]
        acks = argv[5]
        linger_ms = argv[6]
        msg_batch = int(msg_batch)
        msg_requested_rate = float(msg_requested_rate)
        test_duration = float(test_duration)
        topic_name = str(topic_name)
        acks = int(acks)
        linger_ms = int(linger_ms)

    # Initialize Kafka PUB Server
    l.info("Starting Kafka Publisher (producer)")
    # Estimate average message size to compute batch_size in [bytes] / Requested by Kafka
    min_message_size = len(str(0) + ' msg' + str(0))
    max_message_size = len(str(msg_requested_rate) + ' msg' + str(msg_requested_rate))
    average_message_size = (min_message_size + max_message_size) / 2
    batch_estimated_size = (average_message_size) * msg_batch
    l.info("Message Average Size is: [%s]. Kafka Batch Size in Bytes set to: [%s]" % (average_message_size,
                                                                                      batch_estimated_size))
    if old_client:
        producer = KafkaProducer(bootstrap_servers=['localhost:9092'], batch_size=batch_estimated_size,
                                 linger_ms=linger_ms, acks=acks)
    else:
        client = KafkaClient(hosts='localhost:9092')
        topic = client.topics[topic_name]
        producer = topic.get_producer(min_queued_messages=batch_estimated_size, linger_ms=linger_ms, required_acks=acks)

    # Initialize simple Rep server, this is used to listen
    # for the signal to start sending data
    pub_rep_port = os.environ.get('PORT0')
    l.info("STARTING KAFKA REP server at port [%s].", pub_rep_port)
    run_data = {'start': False,
                'stats': {'rate': 0, 'msg_cnt': 0},
                'test_status': 'stopped'}
    pub_metrics = {'test_duration': test_duration,
                   'msg_batch': msg_batch,
                   'msg_requested_rate': msg_requested_rate}
    hd = HDKafkapRepSrv(pub_rep_port, run_data, pub_metrics)
    hd.run()

    while True:
        #  Wait for 'signal' to start sending messages to Kafka Broker
        if not run_data['start']:
            l.debug("KAFKA PUB WAITING FOR SIGNAL...")
            time.sleep(1)
            continue
        l.info('PUB server initiating... Test Duration [%f] secs. Messages with batches [%d]'
               'and requested msg rate [%f]' % (hd.test_duration, hd.msg_batch, hd.msg_requested_rate))
        cnt = 0
        msg_cnt = 0
        start_time = time.time()

        # Start Publishing Messages to Broker
        while True:
            # Build 'message'
            messagedata = "msg%d" % msg_cnt
            message = "%d %s" % (msg_cnt, messagedata)

            try:
                # Publish message to the Kafka Cluster
                # topic: specifies the 'topic' where the message will be published
                if old_client:
                    producer.send(topic=topic_name, value=message)
                else:
                    producer.produce(message)
            except KafkaTimeoutError as e:
                l.error("Unable to publish message to the Kafka Cluster. ERROR: %s" % e.message)

            # Insert a 'delay' if tx rate between batches outperforms the expected
            # (minimum) rate to achieve requested tx rate
            cnt += 1
            msg_cnt += 1
            if cnt >= hd.msg_batch:
                # Compute the delay
                duration = time.time() - start_time
                expected_time = msg_cnt / hd.msg_requested_rate
                delay = 0.0
                if expected_time > duration:
                    delay = expected_time - duration
                if delay > 1:
                    delay = 1
                time.sleep(delay)
                cnt = 0
            elapsed_time = time.time() - start_time
            if elapsed_time >= hd.test_duration:
                break
        # Update 'stats' to 'hd' (HDaemon)
        run_data['stats']['time:end'] = json.dumps(time.time())
        run_data['stats']['rate'] = msg_cnt / elapsed_time
        run_data['stats']['msg_cnt'] = msg_cnt
        process = psutil.Process()
        run_data['stats']['net:end'] = json.dumps(psutil.net_io_counters())
        run_data['stats']['cpu:end'] = json.dumps(process.cpu_times())
        run_data['stats']['mem:end'] = json.dumps(process.memory_info())
        run_data['test_status'] = 'stopping'
        # Go back to waiting for the next test
        run_data['start'] = False
        continue
    producer.close()
    l.info("PUB Server stopping after sending %d messages elapsed time %f and message rate %f" %
           (msg_cnt, elapsed_time, run_data['stats']['rate']))
Example #53
class KafkaSender(LogSender):
    def __init__(self, config, msg_buffer, stats):
        super().__init__(config=config, msg_buffer=msg_buffer, stats=stats,
                         max_send_interval=config.get("max_send_interval", 0.3))
        self.config = config
        self.msg_buffer = msg_buffer
        self.stats = stats

        self.kafka_producer = None

        topic = self.config["kafka_topic"]
        if isinstance(self.config["kafka_topic"], bytes):
            topic = topic.decode("utf8")
        self.topic = topic

    def _init_kafka(self):
        self.log.info("Initializing Kafka producer, address: %r", self.config["kafka_address"])
        while self.running:
            try:
                if self.kafka_producer:
                    self.kafka_producer = self.kafka_producer.close()
                    self.kafka_producer = None

                producer_config = {"bootstrap_servers": self.config["kafka_address"],
                                   "security_protocol": "SSL" if self.config.get("ssl") else "PLAINTEXT",
                                   "ssl_certfile": self.config.get("certfile"),
                                   "ssl_keyfile": self.config.get("keyfile"),
                                   "ssl_cafile": self.config.get("ca"),
                                   "compression_type": "snappy" if snappy else None}

                self.kafka_producer = KafkaProducer(**producer_config)

                self.log.info("Initialized Kafka producer, address: %r", self.config["kafka_address"])
                break
            except KAFKA_CONN_ERRORS as ex:
                self.log.warning("Retriable error during Kafka initialization: %s: %s, sleeping",
                                 ex.__class__.__name__, ex)

            if self.kafka_producer:
                self.kafka_producer.close()
                self.kafka_producer = None
            time.sleep(5.0)

    def send_messages(self, message_batch):
        if not self.kafka_producer:
            self._init_kafka()
        try:
            for message in message_batch:
                self.kafka_producer.send(self.topic, message)
            return True
        except KAFKA_CONN_ERRORS as ex:
            self.log.info("Kafka retriable error during send: %s: %s, waiting", ex.__class__.__name__, ex)
            time.sleep(0.5)
            self._init_kafka()
        except Exception as ex:  # pylint: disable=broad-except
            self.log.exception("Unexpected exception during send to kafka")
            self.stats.unexpected_exception(ex=ex, where="sender", tags={"app": "journalpump"})
            time.sleep(5.0)
            self._init_kafka()

    def _cleanup(self):
        if self.kafka_producer:
            self.kafka_producer.close()
Example #54
class KafkaSource(StoqSourcePlugin):

    def __init__(self):
        super().__init__()

    def activate(self, stoq):
        self.stoq = stoq

        super().activate()

        self.producer = None

    def ingest(self):
        """
        Monitor Kafka for messages

        """

        # Define our Kafka topic
        topic = self.stoq.worker.name

        # If this is an error message, let's make sure our topic
        # has "-errors" affixed to it
        if self.stoq.worker.error_queue is True:
            topic = topic + "-errors".strip()

        consumer = KafkaConsumer(topic,
                                 group_id=self.group,
                                 auto_offset_reset='earliest',
                                 bootstrap_servers=self.servers_list)

        self.log.info("Monitoring {} topic for messages...".format(topic))

        for message in consumer:
            # Setup the amqp message for parsing
            msg = self.stoq.loads(message.value)

            # Send the message to the worker
            self.stoq.worker.multiprocess_put(**msg)

    def producer_connect(self):
        """
        Connect to Kafka to publish a message

        """
        self.producer = KafkaProducer(bootstrap_servers=self.servers_list,
                                      retries=self.retries)

    def producer_release(self):
        """
        Release the Kafka connection used for publishing

        """
        return self.producer.close()

    def publish(self, msg, topic, err=False, **kwargs):
        """
        Publish a message to Kafka

        :param dict msg: Message to be published
        :param str topic: Topic to be used, should be name of worker
        :param bool err: Define whether we should process error topic

        """

        # Make sure we have a valid connection to Kafka
        if not self.producer:
            self.producer_connect()

        # If this is an error message, let's make sure our queue
        # has "-errors" affixed to it
        if err:
            topic = topic + "-errors".strip()

        try:
            self.producer.send(topic, self.stoq.dumps(msg).encode())
        except Exception:
            self.log.error("Unable to publish message to Kafka server: {}".format(msg))
Example #55
class KafkaPipeline(BasePipeline):
    TOPIC = 'craigslist'
    SERIALIZER = MsgPackSerializer()

    def start(self, crawler):

        # TODO: remove this hack
        # HACK
        log.debug("Wait 5s to allow kafka node to be ready")
        time.sleep(5)

        endpoints = list(get_kafka_endpoints())
        log.debug("Connect to kafka as producer - %s", endpoints)
        if not endpoints:
            raise RuntimeError("Kafka endpoints not defined")
        self.producer = KafkaProducer(bootstrap_servers=endpoints)

    def process(self, crawler, item):
        self.producer.send(
            self.TOPIC,
            self.SERIALIZER.dumps(item),
        )
        return item

    def stop(self, crawler):
        self.producer.flush()
        self.producer.close()

    @classmethod
    def dump_data(
            cls, topic=None, timeout=None, poll_timeout=None,
            enable_auto_commit=False):

        # TODO: remove this hack
        # HACK
        log.debug("Wait 5s to allow kafka node to be ready")
        time.sleep(5)

        topic = topic or cls.TOPIC
        endpoints = list(get_kafka_endpoints())
        log.debug("Connect to kafka as consumer - %s", endpoints)
        if not endpoints:
            raise RuntimeError("Kafka endpoints not defined")

        consumer = KafkaConsumer(
            topic,
            auto_offset_reset='earliest',
            enable_auto_commit=enable_auto_commit,
            value_deserializer=cls.SERIALIZER.loads,
            bootstrap_servers=endpoints,
            consumer_timeout_ms=timeout or -1,
        )

        # TODO use native kafka-python poll
        if poll_timeout:
            while True:
                yield list(data.value for data in consumer)
                time.sleep(poll_timeout / 1000.0)
        else:
            for data in consumer:
                yield data.value

        consumer.close()

    @classmethod
    def dump_to_csv(cls, to_file, topic=None, timeout=None):
        log.debug("Dump topic <%s> to %s", topic, to_file)

        csv_pipeline = CsvPipeline(to_file)
        csv_pipeline.start(None)

        for item in cls.dump_data(topic, timeout):
            # we must reinitialize item to restore fields and values ordering
            csv_pipeline.process(
                None,
                CraigsListItem(**dict(
                    # convert dict byte keys to string keys and use it as
                    # keywords
                    (k.decode(), v) for k, v in item.items()
                ))
            )

        csv_pipeline.stop(None)
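The dump_data() method above carries a TODO to switch to kafka-python's native poll; a minimal sketch of that approach, assuming a broker on localhost:9092 and msgpack-encoded values (msgpack.loads stands in for the MsgPackSerializer used above):

import msgpack
from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'craigslist',
    bootstrap_servers=['localhost:9092'],
    auto_offset_reset='earliest',
    enable_auto_commit=False,
    value_deserializer=msgpack.loads,
)

while True:
    # poll() returns {TopicPartition: [ConsumerRecord, ...]}, or {} when it times out
    batch = consumer.poll(timeout_ms=1000)
    for records in batch.values():
        for record in records:
            print(record.value)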
Example #56
0
# producer
import time
from kafka import KafkaProducer
 
producer = KafkaProducer(bootstrap_servers=['localhost:9092'])  # several broker addresses can be given here, e.g. ['0.0.0.1:9092', '0.0.0.2:9092', '0.0.0.3:9092']
 
for i in range(300):
    ts = int(time.time() * 1000)
    msg = "produce yao + msg%d" % i
    print(msg)
    producer.send("test", msg.encode('utf-8'))
    time.sleep(1)
producer.close()
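A matching consumer sketch for the producer above, assuming the same broker and the 'test' topic:

from kafka import KafkaConsumer

consumer = KafkaConsumer('test',
                         bootstrap_servers=['localhost:9092'],
                         auto_offset_reset='earliest',
                         consumer_timeout_ms=10000)  # stop iterating after 10s without messages
for message in consumer:
    print(message.value.decode('utf-8'))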

Example #57
0
from kafka import KafkaProducer
import avro.schema
import io
from avro.io import DatumWriter

data = {'name': 'Tony', 'favorite_number': 8, 'favorite_color': 'green'}

schema = avro.schema.parse(open('./schema.avsc').read())

def serialize(data):
    writer = DatumWriter(schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(data, encoder)
    return bytes_writer.getvalue()


producer = KafkaProducer(bootstrap_servers=['localhost:9092'], value_serializer=serialize)
producer.send('test2', data)
producer.flush()
producer.close() # close will also flush, but I'm leaving it in here for demonstration purposes
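A consumer-side counterpart to the Avro example above, sketched under the assumption that the same schema.avsc, topic and broker are used:

import io
import avro.schema
from avro.io import DatumReader, BinaryDecoder
from kafka import KafkaConsumer

schema = avro.schema.parse(open('./schema.avsc').read())

def deserialize(raw_bytes):
    reader = DatumReader(schema)
    decoder = BinaryDecoder(io.BytesIO(raw_bytes))
    return reader.read(decoder)

consumer = KafkaConsumer('test2',
                         bootstrap_servers=['localhost:9092'],
                         auto_offset_reset='earliest',
                         value_deserializer=deserialize)
for message in consumer:
    print(message.value)  # e.g. {'name': 'Tony', 'favorite_number': 8, 'favorite_color': 'green'}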
Example #58
0
class DocManager(DocManagerBase):

    def __init__(self, url, auto_commit_interval=1, unique_key='_id', chunk_size=10):
        try:
            from kafka import KafkaProducer
        except ImportError:
            raise SystemError("kafka-python is required to use this DocManager")

        self.producer = KafkaProducer(bootstrap_servers=[url])

    def get_topic_key(self, namespace):
        if namespace == 'timetracker.session':
            topic_key = 'session'
        else:
            topic_key = 'activity'
        return topic_key

    def doc_to_message_data(self, doc, namespace, timestamp):
        data = {'timestamp': timestamp,
                'namespace': namespace,
                'action': 'upsert',
                'data': doc}
        return str(data)

    def update_to_message_data(self, doc_id, update_spec, namespace, timestamp):
        data = {'timestamp': timestamp,
                'namespace': namespace,
                'document_id': doc_id,
                'action': 'update',
                'data': update_spec['$set']
                }
        return str(data)

    def remove_to_message_data(self, doc_id, namespace, timestamp):
        data = {'timestamp': timestamp,
                'namespace': namespace,
                'document_id': doc_id,
                'action': 'remove'
                }
        return str(data)

    def publish_data(self, data, topic_key):
        self.producer.send(TOPICS[topic_key], data)
        self.producer.flush()

    def stop(self):
        self.producer.close()

    def upsert(self, doc, namespace, timestamp):
        data = self.doc_to_message_data(doc, namespace, timestamp)
        topic_key = self.get_topic_key(namespace)
        self.publish_data(data, topic_key)


    def update(self, document_id, update_spec, namespace, timestamp):
        data = self.update_to_message_data(document_id, update_spec, namespace, timestamp)
        topic_key = self.get_topic_key(namespace)
        self.publish_data(data, topic_key)

    def remove(self, document_id, namespace, timestamp):
        data = self.remove_to_message_data(document_id, namespace, timestamp)
        topic_key = self.get_topic_key(namespace)
        self.publish_data(data, topic_key)

    def search(self, start_ts, end_ts):
        raise NotImplementedError

    def commit(self):
        raise NotImplementedError

    def get_last_doc(self):
        raise NotImplementedError

    def handle_command(self, doc, namespace, timestamp):
        pass
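The *_to_message_data helpers above serialize payloads with str(), which produces a Python repr rather than a portable encoding; a sketch of a JSON-based alternative (not part of the original doc manager; topic and payload are assumptions):

import json
from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda payload: json.dumps(payload).encode('utf-8'),
)

# hypothetical payload mirroring doc_to_message_data()
payload = {'timestamp': 1700000000, 'namespace': 'timetracker.session',
           'action': 'upsert', 'data': {'_id': '42'}}
producer.send('session', payload)
producer.flush()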
Example #59
0
class JournaldStream(object):
    messages_steps = 100
    logs_topic_name = "logs"
    kafka_sleep = 1

    def __init__(self, kafka_hosts, journald_path, sincedb_path):

        # Sincedb is a file where the __CURSOR of Journald is stored
        self.sincedb_path = self._force_type_value(str, sincedb_path)
        self._read_or_create_sincedb(self.sincedb_path)

        # /run/log/journal
        self.journald_path = self._force_type_value(str, journald_path)
        self._is_journal_dir(self.journald_path)
        self.reader = journal.Reader(path=self.journald_path, converters=BASIC_CONVERTERS)

        # Kafka hosts
        self.kafka_hosts = self._force_type_value(list, kafka_hosts)
        self.producer = KafkaProducer(
            bootstrap_servers=self.kafka_hosts,
            value_serializer=lambda v: json.dumps(v))

        self.cursor = ""
        self.read_messages = 0
        self.key_filters = self._build_key_filters()
        self.value_filters = lambda x: x

    @staticmethod
    def _read_or_create_sincedb(sincedb_path):
        if os.path.isfile(sincedb_path):
            with open(sincedb_path, 'r') as db:
                db.read()
        else:
            with open(sincedb_path, "w") as empty_db:
                empty_db.write("")

    @staticmethod
    def _is_journal_dir(journald_path):
        if not os.path.isdir(journald_path):
            raise IOError("journald path %s is not a directory" % journald_path)

    @staticmethod
    def _build_key_filters():
        """
        Transform the keys of a dict
        :return: list of functions
        """

        def remove_prefix(key, prefix="_"):
            """
            Journald create keys with '_', '__' prefix
            :param key:
            :param prefix:
            :return: Key reformatted
            """
            new = key
            while new[0] == prefix:
                new = new[1:]
            return new

        def lower_key(key):
            return key.lower()

        def aggregate_filters(key):
            for f in [remove_prefix, lower_key]:
                key = f(key)
            return key

        return aggregate_filters

    @staticmethod
    def _force_type_value(type_want, variable):
        """
        Raise TypeError if the type does not match
        :param type_want:
        :param variable:
        :return: variable
        """
        if type_want is not type(variable):
            raise TypeError("%s is not type(%s)" % (type_want, type(variable)))

        return variable

    def _save_cursor(self):
        if self.cursor != "":
            with open(self.sincedb_path, 'w') as f:
                f.write(self.cursor)
        else:
            os.write(2, "invalid cursor\n")

    def _get_cursor(self):
        try:
            with open(self.sincedb_path, 'r') as f:
                self.cursor = f.read()
                return True if self.cursor else False
        except IOError:
            return False

    def _stream_to_seek(self):
        if self._get_cursor():
            os.write(1, "using saved cursor \"%s\"\n" % self.cursor)
            self.reader.seek_cursor(self.cursor)
            self.reader.get_next()
        else:
            os.write(1, "using new cursor\n")

        for log in self.reader:
            self._kafka_send(log)

        os.write(1, "seeked journal after %d messages\n" % self.read_messages)

    def _stream_poller(self):
        i = 0
        os.write(1, "start polling realtime messages\n")
        while self.reader.get_events():
            i += 1
            if self.reader.process() == journal.APPEND:
                for log in self.reader:
                    self._kafka_send(log)
            else:
                time.sleep(self.kafka_sleep)
            self._periodic_stream_task(i)

    def stream(self):
        """
        Public method
        """
        self._stream_to_seek()
        self._stream_poller()

    def _periodic_send_task(self):
        if self.read_messages % self.messages_steps == 0:
            os.write(1, "read %d messages, process flush\n" % self.read_messages)
            ts = time.time()
            self.producer.flush()
            os.write(1, "flush done in %d\n" % (time.time() - ts))

    @staticmethod
    def _periodic_stream_task(nb_message):
        pass

    def _filters(self, full_log):
        # Keys
        filter_data = {self.key_filters(k): self.value_filters(v) for k, v in full_log.iteritems()}

        # Values
        # Handled by the journal's builtin BASIC_CONVERTERS

        return filter_data

    def _kafka_send(self, full_log):
        # Transform the log
        filter_data = self._filters(full_log)

        # Send it to Kafka
        self.producer.send(self.logs_topic_name, filter_data)

        # Save the cursor
        self.cursor = full_log["__CURSOR"]
        self._save_cursor()

        # Internal instance stats
        self.read_messages += 1
        self._periodic_send_task()

    def close(self):
        os.write(1, "closing journald.Reader\n")
        self.reader.close()
        os.write(1, "closing kafka connection\n")
        self.producer.close()
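A minimal usage sketch for the class above (broker address and paths are illustrative assumptions):

# Illustrative: broker address and paths are assumptions.
stream = JournaldStream(
    kafka_hosts=["localhost:9092"],
    journald_path="/run/log/journal",
    sincedb_path="/var/lib/journald-stream/sincedb",
)
try:
    stream.stream()   # replay from the saved cursor, then poll for new entries
finally:
    stream.close()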
    def run(args):
        try:
            props = {}
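            # each entry of consumer_config is a "key=value" string; numeric values
            # are converted to ints and the literal string 'None' becomes None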
            for prop in args.consumer_config:
                k, v = prop.split('=')
                try:
                    v = int(v)
                except ValueError:
                    pass
                if v == 'None':
                    v = None
                props[k] = v

            if args.brokers:
                brokers = start_brokers(args.brokers)
                props['bootstrap_servers'] = ['{0}:{1}'.format(broker.host, broker.port)
                                              for broker in brokers]
                print('---> bootstrap_servers={0}'.format(props['bootstrap_servers']))
                print()

                print('-> Producing records')
                record = bytes(bytearray(args.record_size))
                producer = KafkaProducer(compression_type=args.fixture_compression,
                                         **props)
                for i in xrange(args.num_records):
                    producer.send(topic=args.topic, value=record)
                producer.flush()
                producer.close()
                print('-> OK!')
                print()

            print('Initializing Consumer...')
            props['auto_offset_reset'] = 'earliest'
            if 'consumer_timeout_ms' not in props:
                props['consumer_timeout_ms'] = 10000
            props['metrics_sample_window_ms'] = args.stats_interval * 1000
            for k, v in props.items():
                print('---> {0}={1}'.format(k, v))
            consumer = KafkaConsumer(args.topic, **props)
            print('---> group_id={0}'.format(consumer.config['group_id']))
            print('---> report stats every {0} secs'.format(args.stats_interval))
            print('---> raw metrics? {0}'.format(args.raw_metrics))
            timer_stop = threading.Event()
            timer = StatsReporter(args.stats_interval, consumer,
                                  event=timer_stop,
                                  raw_metrics=args.raw_metrics)
            timer.start()
            print('-> OK!')
            print()

            records = 0
            for msg in consumer:
                records += 1
                if records >= args.num_records:
                    break
            print('Consumed {0} records'.format(records))

            timer_stop.set()

        except Exception:
            exc_info = sys.exc_info()
            traceback.print_exception(*exc_info)
            sys.exit(1)