Example #1
def main(args):
    topic = args.topic
    delimiter = args.delimiter

    producer_conf = producer_config(args)

    producer = SerializingProducer(producer_conf)

    print('Producing records to topic {}. ^C to exit.'.format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            msg_data = input(">")
            msg = msg_data.split(delimiter)
            if len(msg) == 2:
                producer.produce(topic=topic, key=msg[0], value=msg[1],
                                 on_delivery=delivery_report)
            else:
                producer.produce(topic=topic, value=msg[0],
                                 on_delivery=delivery_report)
        except KeyboardInterrupt:
            break

    print('\nFlushing {} records...'.format(len(producer)))
    producer.flush()
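This and several of the later examples rely on a delivery_report callback (and, here, a producer_config(args) helper) defined elsewhere in their projects. A minimal sketch of such a callback, assuming the standard (err, msg) arguments that confluent_kafka passes to on_delivery handlers, might look like this:

def delivery_report(err, msg):
    """Reports the result of an asynchronous produce() call (assumed helper)."""
    if err is not None:
        print("Delivery failed for record {}: {}".format(msg.key(), err))
        return
    print("Record {} produced to {} [{}] at offset {}".format(
        msg.key(), msg.topic(), msg.partition(), msg.offset()))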
Example #2
def main(args):
    topic = args.topic
    delimiter = args.delimiter
    producer_conf = {'bootstrap.servers': args.bootstrap_servers,
                     'key.serializer': StringSerializer('utf_8'),
                     'value.serializer': StringSerializer('utf_8')}

    producer_conf.update(sasl_conf(args))

    producer = SerializingProducer(producer_conf)

    print("Producing records to topic {}. ^C to exit.".format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            msg_data = input(">")
            msg = msg_data.split(delimiter)
            if len(msg) == 2:
                producer.produce(topic=topic, key=msg[0], value=msg[1],
                                 on_delivery=delivery_report)
            else:
                producer.produce(topic=topic, value=msg[0],
                                 on_delivery=delivery_report)
        except KeyboardInterrupt:
            break

    print("\nFlushing {} records...".format(len(producer)))
    producer.flush()
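The sasl_conf(args) helper above is not shown. A hedged sketch, assuming SASL credentials arrive as command-line arguments (the argument names are illustrative only):

def sasl_conf(args):
    # Hypothetical helper: maps CLI arguments onto librdkafka SASL settings.
    return {'security.protocol': 'SASL_SSL',
            'sasl.mechanism': args.sasl_mechanism,   # e.g. "PLAIN"
            'sasl.username': args.user_principal,
            'sasl.password': args.user_secret}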
Example #3
def main(args):
    topic = args.topic

    schema_str = """
    {
        "namespace": "confluent.io.examples.serialization.avro",
        "name": "User",
        "type": "record",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "favorite_number", "type": "int"},
            {"name": "favorite_color", "type": "string"}
        ]
    }
    """
    schema_registry_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    avro_serializer = AvroSerializer(schema_registry_client, schema_str,
                                     user_to_dict)

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': avro_serializer
    }

    producer = SerializingProducer(producer_conf)

    print("Producing user records to topic {}. ^C to exit.".format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            user_name = input("Enter name: ")
            user_address = input("Enter address: ")
            user_favorite_number = int(input("Enter favorite number: "))
            user_favorite_color = input("Enter favorite color: ")
            user = User(name=user_name,
                        address=user_address,
                        favorite_color=user_favorite_color,
                        favorite_number=user_favorite_number)
            producer.produce(topic=topic,
                             key=str(uuid4()),
                             value=user,
                             on_delivery=delivery_report)
        except KeyboardInterrupt:
            break
        except ValueError:
            print("Invalid input, discarding record...")
            continue

    print("\nFlushing records...")
    producer.flush()
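The User class and user_to_dict converter used above come from the surrounding module. A minimal sketch consistent with the schema (note that address is collected but, having no field in the schema, would simply not be serialized):

class User(object):
    """User record; only the fields present in the Avro schema are serialized."""
    def __init__(self, name, address, favorite_number, favorite_color):
        self.name = name
        self.address = address  # not part of the Avro schema above
        self.favorite_number = favorite_number
        self.favorite_color = favorite_color


def user_to_dict(user, ctx):
    """Converts a User into the dict shape expected by AvroSerializer."""
    return dict(name=user.name,
                favorite_number=user.favorite_number,
                favorite_color=user.favorite_color)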
Example #4
def send_record(args):
    """ Sends Record using a SerializingProducer & AvroSerializer """
    topic = args.topic.rstrip()

    schema_registry_config = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(schema_registry_config)

    avro_serializer = AvroSerializer(schema_registry_client, DATA_SCHEMA,
                                     data_to_dict)

    producer_config = {
        "bootstrap.servers": args.bootstrap_servers,
        "key.serializer": StringSerializer('utf_8'),
        "value.serializer": avro_serializer
    }
    producer = SerializingProducer(producer_config)

    split_incoming_data = args.record_value.split(',')
    if len(split_incoming_data) != 7:  # Data Format Check
        print('** Error: Expected 7 comma-separated fields, got: ', split_incoming_data)
        raise ValueError('record_value must contain exactly 7 comma-separated fields')
    try:  # Data Format Check
        incoming_data = {
            'envId': int(split_incoming_data[0]),
            'whenCollected': str(split_incoming_data[1]),
            'timeLightOnMins': int(split_incoming_data[2]),
            'humidity': int(split_incoming_data[3]),
            'soilMoisture': int(split_incoming_data[4]),
            'temperature': int(split_incoming_data[5]),
            'waterConsumption': int(split_incoming_data[6])
        }
    except Exception as error:
        print('** Error Creating Dict of Data: ', error)
        raise

    print(f'Producing data records to topic {topic}. ^C to exit.')
    producer.poll(1)
    try:
        key = args.record_key if args.record_key else str(uuid4())
        data_object = Data(incoming_data)
        print('\t-Producing Avro record. . .')
        producer.produce(topic=topic,
                         key=key,
                         value=data_object,
                         on_delivery=delivery_report)
    except ValueError:
        print('\t-Invalid input, discarding record. . .')
    print('\nFlushing records. . .')
    producer.flush()
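DATA_SCHEMA, the Data class, and data_to_dict are defined elsewhere in this project; a hedged sketch of the latter two, assuming Data simply wraps the dict built above:

class Data(object):
    """Hypothetical wrapper around one row of environmental sensor data."""
    def __init__(self, record):
        self.record = record


def data_to_dict(data, ctx):
    # AvroSerializer calls this with (object, SerializationContext).
    return data.record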
Example #5
class DatahubKafkaEmitter:
    def __init__(self, config: KafkaEmitterConfig):
        self.config = config

        schema_registry_conf = {
            "url": self.config.connection.schema_registry_url,
            **self.config.connection.schema_registry_config,
        }
        schema_registry_client = SchemaRegistryClient(schema_registry_conf)

        def convert_mce_to_dict(
            mce: MetadataChangeEvent, ctx: SerializationContext
        ) -> dict:
            tuple_encoding = mce.to_obj(tuples=True)
            return tuple_encoding

        avro_serializer = AvroSerializer(
            schema_str=SCHEMA_JSON_STR,
            schema_registry_client=schema_registry_client,
            to_dict=convert_mce_to_dict,
        )

        producer_config = {
            "bootstrap.servers": self.config.connection.bootstrap,
            "key.serializer": StringSerializer("utf_8"),
            "value.serializer": avro_serializer,
            **self.config.connection.producer_config,
        }

        self.producer = SerializingProducer(producer_config)

    def emit_mce_async(
        self,
        mce: MetadataChangeEvent,
        callback: Callable[[Exception, str], None],
    ) -> None:
        # Call poll to trigger any callbacks on success / failure of previous writes
        self.producer.poll(0)
        self.producer.produce(
            topic=self.config.topic,
            key=mce.proposedSnapshot.urn,
            value=mce,
            on_delivery=callback,
        )

    def flush(self) -> None:
        self.producer.flush()
Example #6
def main(args):
    topic = args.topic

    with open('schema/KeySchema.avsc', 'r') as f:
        key_schema_str = f.read()
    with open('schema/ValueSchema.avsc', 'r') as f:
        value_schema_str = f.read()
    schema_registry_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    avro_key_serializer = AvroSerializer(key_schema_str, schema_registry_client, user_quote_key_to_dict)
    avro_value_serializer = AvroSerializer(value_schema_str, schema_registry_client, user_quote_value_to_dict)

    producer_conf = {'bootstrap.servers': args.bootstrap_servers,
                     'key.serializer': avro_key_serializer,
                     'value.serializer': avro_value_serializer}

    producer = SerializingProducer(producer_conf)

    print("Producing user records to topic {}. ^C to exit.".format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            user_id = input("Enter User ID: ")
            product_id = input("Enter Product ID: ")
            quoted_price = input("Enter price: ")
            quoted_quantity = int(input("Enter the desired quantity: "))
            user_note = input("Enter additional note: ")

            user_quote_key = UserQuoteKey(user_id=int(user_id))

            user_quote_value = UserQuoteValue(product_id=int(product_id),
                                              quoted_price=int(quoted_price),
                                              quoted_quantity=quoted_quantity,
                                              user_note=user_note)

            producer.produce(topic=topic, key=user_quote_key, value=user_quote_value,
                             on_delivery=delivery_report)
        except KeyboardInterrupt:
            break
        except ValueError:
            print("Invalid input, discarding record...")
            continue

    print("\nFlushing records...")
    producer.flush()
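The UserQuoteKey/UserQuoteValue classes and their *_to_dict converters are assumed to mirror the two .avsc schemas loaded above; a minimal sketch:

class UserQuoteKey(object):
    def __init__(self, user_id):
        self.user_id = user_id


class UserQuoteValue(object):
    def __init__(self, product_id, quoted_price, quoted_quantity, user_note):
        self.product_id = product_id
        self.quoted_price = quoted_price
        self.quoted_quantity = quoted_quantity
        self.user_note = user_note


def user_quote_key_to_dict(key, ctx):
    return dict(user_id=key.user_id)


def user_quote_value_to_dict(value, ctx):
    return dict(product_id=value.product_id,
                quoted_price=value.quoted_price,
                quoted_quantity=value.quoted_quantity,
                user_note=value.user_note)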
Example #7
class KafkaProducer:
    def __init__(self, topic, producer_config):
        self._topic = topic
        self._producer = SerializingProducer(producer_config.dict)

    def produce(self, record):
        while True:
            try:
                self._producer.produce(topic=self._topic,
                                       key=record.key_to_avro_dict(),
                                       value=record.value_to_avro_dict(),
                                       on_delivery=self._delivery_report)
                self._producer.poll(0)
                break
            except BufferError as e:
                print(f'Failed to send on attempt {record}. '
                      f'Error received {str(e)}')
                self._producer.poll(1)

    def flush(self):
        if self._producer:
            self._producer.flush()

    @staticmethod
    def _delivery_report(err: KafkaError, msg: Message):
        """ Reports the failure or success of a message delivery.

        Note:
            In the delivery report callback the Message.key()
            and Message.value() will be the binary format as
            encoded by any configured Serializers and
            not the same object that was passed to produce().
            If you wish to pass the original object(s)
            for key and value to delivery
            report callback we recommend a bound callback
            or lambda where you pass the objects along.

        Args:
            err ([KafkaError]): The error that occurred, or None on success.
            msg ([Message]): The message that was produced or failed.
        """

        if err is not None:
            print(f"Delivery failed for record {msg.key()}: {err}")
Example #8
class VideoProducer:
    def __init__(self,
                 topic='test',
                 client_id='producer1',
                 bootstrap_servers='localhost:9092',
                 video_reader=None):
        self.topic = topic
        self.video_reader = video_reader
        self.kafka_producer = SerializingProducer({
            'bootstrap.servers':
            bootstrap_servers,
            'value.serializer':
            self.video_reader.serialize,
            'queue.buffering.max.messages':
            500000
        })
        self.delivered_records = 0
        self.start_time = 0

    def acked(self, err, msg):
        """Delivery report handler called on
        successful or failed delivery of message
        """
        if err is not None:
            print("Failed to deliver message: {}".format(err))
        else:
            self.delivered_records += 1
        # print(sys.getsizeof(message))

    def produce(self):
        start_time = time.time()
        while (time.time() - start_time < 60 and self.video_reader.online):
            self.kafka_producer.poll(0.0)
            frame = self.video_reader.read()
            if frame is not None:
                self.kafka_producer.produce(topic=self.topic,
                                            value=frame,
                                            on_delivery=self.acked)
        print("\nFlushing records...")
        self.kafka_producer.flush()
        finished_time = time.time()
        print("MPS: {}".format(self.delivered_records /
                               (finished_time - start_time)))
        self.video_reader.release()
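A hedged usage sketch; the reader object is hypothetical and only needs to satisfy the small interface VideoProducer relies on (serialize(frame, ctx), read(), online, release()):

class DummyReader:
    """Hypothetical stand-in for a real video source (e.g. an OpenCV capture wrapper)."""
    online = True

    def serialize(self, frame, ctx):
        return frame  # frames are already bytes in this sketch

    def read(self):
        return b'\x00' * 1024  # fake frame payload

    def release(self):
        pass


producer = VideoProducer(topic='video-frames',
                         bootstrap_servers='localhost:9092',
                         video_reader=DummyReader())
producer.produce()  # streams frames for ~60 seconds, then flushes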
Example #9
def main(args):
    topic = args.topic

    schema_registry_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    protobuf_serializer = ProtobufSerializer(user_pb2.User,
                                             schema_registry_client,
                                             {'use.deprecated.format': True})

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': protobuf_serializer
    }

    producer = SerializingProducer(producer_conf)

    print("Producing user records to topic {}. ^C to exit.".format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            user_name = input("Enter name: ")
            user_favorite_number = int(input("Enter favorite number: "))
            user_favorite_color = input("Enter favorite color: ")
            user = user_pb2.User(name=user_name,
                                 favorite_color=user_favorite_color,
                                 favorite_number=user_favorite_number)
            producer.produce(topic=topic,
                             partition=0,
                             key=str(uuid4()),
                             value=user,
                             on_delivery=delivery_report)
        except (KeyboardInterrupt, EOFError):
            break
        except ValueError:
            print("Invalid input, discarding record...")
            continue

    print("\nFlushing records...")
    producer.flush()
Example #10
def produce(goal):
    count = 0
    reusableProducer = SerializingProducer(getConfigs())
    while (count < goal):
        try:
            reusableProducer.produce(topic='myprototopic',
                                     key=str(uuid4()),
                                     value=generateRecord(),
                                     on_delivery=getReport)
            count += 1  # count produced records so the loop can reach the goal
            # print("In process:{}".format(multiprocessing.current_process().name))
            reusableProducer.poll(0.0)
        except KeyboardInterrupt:
            break
        except BufferError:
            sys.stderr.write(
                '%% Local producer queue is full (%d messages awaiting delivery): flushing...\n'
                % len(reusableProducer))
            reusableProducer.flush()

    print("Flushing one producer thread")
    reusableProducer.flush()
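getConfigs(), generateRecord(), and getReport are project helpers that are not shown here; a hedged sketch of what they might look like (the real example presumably uses a Protobuf serializer for the value, given the topic name):

import sys
from uuid import uuid4

from confluent_kafka.serialization import StringSerializer


def getConfigs():
    # Assumed helper: builds the SerializingProducer configuration once per process.
    return {'bootstrap.servers': 'localhost:9092',
            'key.serializer': StringSerializer('utf_8'),
            'value.serializer': StringSerializer('utf_8')}


def generateRecord():
    # Assumed helper: returns one record payload to publish.
    return 'record-{}'.format(uuid4())


def getReport(err, msg):
    # Assumed delivery callback.
    if err is not None:
        sys.stderr.write('Delivery failed: {}\n'.format(err))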
Example #11
    def produce(self, count: int):
        def increment(err, msg):
            assert err is None
            assert msg is not None
            assert msg.offset() == self.acked
            self.logger.debug("Acked offset %d", msg.offset())
            self.acked += 1

        producer = SerializingProducer({
            'bootstrap.servers':
            self.brokers,
            'key.serializer':
            StringSerializer('utf_8'),
            'value.serializer':
            self._make_serializer()
        })

        self.logger.info("Producing %d %s records to topic %s", count,
                         self.schema_type.name, self.topic)
        for i in range(count):
            # Prevent overflow of buffer
            while len(producer) > 50000:
                # Serve on_delivery callbacks from previous calls to produce()
                producer.poll(0.1)

            producer.produce(topic=self.topic,
                             key=str(uuid4()),
                             value=self._make_payload(i),
                             on_delivery=increment)
            self.produced += 1

        self.logger.info("Flushing records...")
        producer.flush()
        self.logger.info("Records flushed: %d", self.produced)
        while self.acked < count:
            producer.poll(0.01)
        self.logger.info("Records acked: %d", self.acked)
Example #12
def main(args):
    topic = args.topic

    schema_registry_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    avro_serializer = AvroSerializer(User.avro_schema(),
                                     schema_registry_client,
                                     user_to_dict)

    producer_conf = {'bootstrap.servers': args.bootstrap_servers,
                     'key.serializer': StringSerializer('utf_8'),
                     'value.serializer': avro_serializer}

    producer = SerializingProducer(producer_conf)

    print(f"Producing user records to topic {topic}. ^C to exit.")
    while True:
        producer.poll(0.0)
        try:
            user_name = input("Enter name: ")
            user_favorite_number = int(input("Enter favorite number: "))
            user_favorite_color = input("Enter favorite color: ")
            user = User(name=user_name,
                        favorite_color=user_favorite_color,
                        favorite_number=user_favorite_number)
            producer.produce(topic=topic, key=str(uuid4()), value=user,
                             on_delivery=delivery_report)
        except KeyboardInterrupt:
            break
        except ValueError:
            print("Invalid input, discarding record...")
            continue

    print("\nFlushing records...")
    producer.flush()
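Unlike the earlier Avro example, this variant asks the model itself for its schema via User.avro_schema(), which suggests a self-describing model such as one built with the dataclasses-avroschema package; a hedged sketch under that assumption:

from dataclasses import dataclass

from dataclasses_avroschema import AvroModel  # assumed dependency


@dataclass
class User(AvroModel):
    name: str
    favorite_number: int
    favorite_color: str


def user_to_dict(user, ctx):
    # AvroModel instances expose asdict(); ctx is the SerializationContext.
    return user.asdict()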
Example #13
    def test_producer(self):
        # Read arguments and configurations and initialize
        producer_config = {
            'bootstrap.servers': self.conf['bootstrap.servers'],
            'key.serializer': self.key_avro_serializer,
            'value.serializer': self.value_avro_serializer
        }
        producer = SerializingProducer(producer_config)

        delivered_records = 0
        for text in self.test_messages:
            url = 'www.test.com'
            scraper_dt = datetime.now(pytz.timezone('America/Denver'))
            scraper_dt = scraper_dt.strftime("%Y/%m/%d %H:%M:%S %z")
            value_obj = google.Value(text=text, scraper_dt=scraper_dt)
            key_obj = google.Key(url=(url))
            producer.produce(topic=self.topic,
                             key=key_obj,
                             value=value_obj,
                             on_delivery=kafka_utils.acked)
            delivered_records += producer.poll()
        producer.flush()

        assert delivered_records == len(self.test_messages)
Example #14
# Load HopsWorks Kafka configuration
conf = toml.load('config.toml')
# Initialize a simple String serializer for the key
string_serializer = StringSerializer('utf_8')

producer_conf = {
    'bootstrap.servers': conf['hops']['url'] + ':' + conf['kafka']['port'],
    'security.protocol': 'SSL',
    'ssl.ca.location': conf['project']['ca_file'],
    'ssl.certificate.location': conf['project']['certificate_file'],
    'ssl.key.location': conf['project']['key_file'],
    'ssl.key.password': conf['project']['key_password'],
    'key.serializer': string_serializer,
    'value.serializer': string_serializer,
    'client.id': socket.gethostname()
}

print(producer_conf)

producer = SerializingProducer(producer_conf)

producer.produce(conf['kafka']['topic'],
                 key="key",
                 value="value",
                 on_delivery=acked)

# Wait up to 1 second for events. Callbacks will be invoked during
# this method call if the message is acknowledged.
producer.poll(1)
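The acked delivery callback used above is not shown; it is assumed to follow the same (err, msg) signature as the other delivery handlers in these examples, for instance:

def acked(err, msg):
    # Assumed delivery report handler.
    if err is not None:
        print("Failed to deliver message: {}".format(err))
    else:
        print("Produced record to topic {} partition [{}] @ offset {}".format(
            msg.topic(), msg.partition(), msg.offset()))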
Example #15
    # Optional per-message on_delivery handler (triggered by poll() or flush())
    # when a message has been successfully delivered or
    # permanently failed delivery (after retries).
    def acked(err, msg):
        """Delivery report handler called on
        successful or failed delivery of message
        """
        global delivered_records
        if err is not None:
            print("Failed to deliver message: {}".format(err))
        else:
            delivered_records += 1
            print("Produced record to topic {} partition [{}] @ offset {}".
                  format(msg.topic(), msg.partition(), msg.offset()))

    for n in range(10):
        name_object = ccloud_lib.Name()
        name_object.name = "alice"
        count_object = ccloud_lib.Count()
        count_object.count = n
        print("Producing Avro record: {}\t{}".format(name_object.name,
                                                     count_object.count))
        producer.produce(topic=topic,
                         key=name_object,
                         value=count_object,
                         on_delivery=acked)
        producer.poll(0)

    producer.flush()

    print("{} messages were produced to topic {}!".format(
        delivered_records, topic))
Example #16
class IFCBImageIngestor:
    def __init__(self, ifcb_url: str, producer_config: dict) -> None:
        logger.info(
            f"Ingesting image data to {producer_config['bootstrap.servers']} from {ifcb_url}"
        )
        self.ifcb_url = ifcb_url
        self.producer = SerializingProducer(producer_config)

    def publish_ifcb_image_feed(self,
                                ifcb_url: str,
                                topic: str,
                                nbins: int = 1) -> None:
        """Ingest newest N image bins to a Kafka topic from an IFCB dashboard"""
        logger.info(f'Ingesting last {nbins} bin(s) of data from {ifcb_url}')

        feed = requests.get(ifcb_url + 'feed.json')
        latest_bins = feed.json()
        for bin in latest_bins[:nbins]:
            self._download_and_publish_image_bin(bin['pid'], topic)

        # wait for outstanding messages and delivery reports
        self.producer.flush()

    def publish_ifcb_image_station(self,
                                   ifcb_url: str,
                                   topic: str,
                                   nbins: int = 1) -> None:
        """Ingest oldest N image bins to a Kafka topic from an IFCB dashboard"""
        logger.info(f'Ingesting oldest {nbins} bin(s) of data from {ifcb_url}')

        bin_count = 0
        date = self._get_earliest_date()
        while date <= datetime.datetime.today() and bin_count < nbins:
            daily_bins = requests.get(
                ifcb_url + f"/api/feed/day/{date.strftime('%Y-%m-%d')}")
            for bin in daily_bins.json():
                try:
                    self._download_and_publish_image_bin(bin['pid'], topic)
                except:  # noqa
                    continue
                bin_count += 1
            # Advance to the next day so the loop eventually terminates
            date += datetime.timedelta(days=1)

        # wait for outstanding messages and delivery reports
        self.producer.flush()

    def _get_earliest_date(self) -> datetime.datetime:
        """Given IFCB URL (namespace), return the earliest date that data is available"""
        # Query with an impossibly early date to determine the earliest available date
        earliest = requests.get(self.ifcb_url + '/api/feed/nearest/2000-01-01')
        earliest_date = earliest.json()['date']
        return datetime.datetime.strptime(earliest_date, '%Y-%m-%dT%H:%M:%SZ')

    def _download_and_publish_image_bin(self, bin_pid: str,
                                        topic: str) -> None:
        """Download, unzip, and publish image from the bin pid (url)"""
        zip_url = bin_pid + '.zip'
        logger.info(f'loading {zip_url}')
        image_bin = IFCBZippedBin(zip_url)

        for roi in image_bin.filenames.keys():
            logger.debug(f'publishing {image_bin.filenames[roi]}')
            # DYYYYMMDDTHHMMSSZ_IFCBNNN_NNNNN.png
            self.producer.poll(0)
            self.producer.produce(topic=topic,
                                  key=image_bin.to_key(roi),
                                  value=json.dumps(image_bin.to_record(roi)),
                                  on_delivery=delivery_report)
Example #17
def main(
    name: str,
    shutdown: multiprocessing.Value,
    request_queue: multiprocessing.Queue,
    config: Config
) -> None:
    """Execute tasks forever.

    This method is the entrypoint for the worker which executes the monitoring
    tasks. It is executed in a dedicate child process.
    """
    if config.verbose:
        logging.basicConfig(level=logging.INFO)
    log = logging.getLogger(name)
    log.info(f"Starting process {name}.")

    # SIGINT will be delivered to the whole process group. We'll need to ignore
    # it in the worker processes to give them the opportunity to finish any
    # pending work.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    schema_registry_client = SchemaRegistryClient({
        'url': config.schema_registry
    })
    avro_serializer = AvroSerializer(
        Report.SCHEMA,
        schema_registry_client,
        Report.asdict
    )

    producer = SerializingProducer({
        'client.id': name,
        'bootstrap.servers': config.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'security.protocol': 'SSL',
        'ssl.key.location': config.auth_key,
        'ssl.certificate.location': config.auth_cert,
        'ssl.ca.location': config.ca_cert,
        'value.serializer': avro_serializer,
    })
    err = _report_error(log)

    while not shutdown.value:
        producer.poll(0.0)
        try:
            now = datetime.now()
            req = request_queue.get(timeout=1)
            curl = pycurl.Curl()
            curl.setopt(pycurl.URL, req)
            curl.setopt(pycurl.CONNECTTIMEOUT, 30)
            curl.setopt(pycurl.TIMEOUT, 300)
            curl.setopt(pycurl.NOSIGNAL, 1)
            curl.setopt(pycurl.WRITEFUNCTION, len)
            try:
                curl.perform()
                report = Report(
                    timestamp=now.timestamp(),
                    url=req,
                    code=int(curl.getinfo(pycurl.RESPONSE_CODE)),
                    namelookup=curl.getinfo(pycurl.NAMELOOKUP_TIME),
                    connect=curl.getinfo(pycurl.CONNECT_TIME),
                    appconnect=curl.getinfo(pycurl.APPCONNECT_TIME),
                    pretransfer=curl.getinfo(pycurl.PRETRANSFER_TIME),
                    starttransfer=curl.getinfo(pycurl.STARTTRANSFER_TIME),
                    total=curl.getinfo(pycurl.TOTAL_TIME),
                )
                log.info(str(report))
                producer.produce(
                    topic=config.topic,
                    key=req,
                    value=report,
                    on_delivery=err
                )
            except TypeError:
                # It'll never work if we misconfigure PycURL.
                raise
            except pycurl.error as exc:
                # TODO: Record the failure in Kafka.
                log.warning(f"Failed to retrieve {req}", exc)
            # TODO: Handle exceptions from the Kafka Producer.
            finally:
                curl.close()
        except queue.Empty:
            log.debug("No request to process.")
    # Flush any results that haven't been committed yet.
    log.warning(f"Process {name} shutting down.")
    producer.flush()
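Report is assumed to be a small data holder that also exposes the matching Avro schema (Report.SCHEMA) and an asdict converter compatible with AvroSerializer; a hedged sketch of its shape:

import dataclasses
from dataclasses import dataclass


@dataclass
class Report:
    """Hypothetical monitoring report; SCHEMA would hold the matching Avro schema string."""
    timestamp: float
    url: str
    code: int
    namelookup: float
    connect: float
    appconnect: float
    pretransfer: float
    starttransfer: float
    total: float

    @staticmethod
    def asdict(report, ctx):
        # AvroSerializer calls its to_dict argument with (object, SerializationContext).
        return dataclasses.asdict(report)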
Example #18
def main(args):
    topic = args.topic

    schema_str = """
    {
      "$schema": "http://json-schema.org/draft-07/schema#",
      "title": "User",
      "description": "A Confluent Kafka Python User",
      "type": "object",
      "properties": {
        "name": {
          "description": "User's name",
          "type": "string"
        },
        "favorite_number": {
          "description": "User's favorite number",
          "type": "number",
          "exclusiveMinimum": 0
        },
        "favorite_color": {
          "description": "User's favorite color",
          "type": "string"
        }
      },
      "required": [ "name", "favorite_number", "favorite_color" ]
    }
    """
    schema_registry_conf = {'url': args.schema_registry}
    schema_registry_client = SchemaRegistryClient(schema_registry_conf)

    json_serializer = JSONSerializer(schema_registry_client, schema_str,
                                     user_to_dict)

    producer_conf = {
        'bootstrap.servers': args.bootstrap_servers,
        'key.serializer': StringSerializer('utf_8'),
        'value.serializer': json_serializer
    }

    producer = SerializingProducer(producer_conf)

    print("Producing user records to topic {}. ^C to exit.".format(topic))
    while True:
        # Serve on_delivery callbacks from previous calls to produce()
        producer.poll(0.0)
        try:
            user_name = input("Enter name: ")
            user_address = input("Enter address: ")
            user_favorite_number = int(input("Enter favorite number: "))
            user_favorite_color = input("Enter favorite color: ")
            user = User(name=user_name,
                        address=user_address,
                        favorite_color=user_favorite_color,
                        favorite_number=user_favorite_number)
            producer.produce(topic=topic,
                             key=str(uuid4()),
                             value=user,
                             on_delivery=delivery_report)
        except KeyboardInterrupt:
            break
        except ValueError:
            print("Invalid input, discarding record...")
            continue

    print("\nFlushing records...")
    producer.flush()
Example #19
class TestConfluentProtobufProtobuf:
    def __init__(self, driver, nameSalt):
        self.driver = driver
        self.fileName = "travis_correct_confluent_protobuf_protobuf"
        self.topic = self.fileName + nameSalt

        self.sensor = sensor_pb2.SensorReading()
        self.sensor.dateTime = 1234
        self.sensor.reading = 321.321
        self.sensor.device.deviceID = "555-4321"
        self.sensor.device.enabled = True

        self.sensor.float_val = 4321.4321
        self.sensor.int32_val = (1 << 31) - 1
        self.sensor.sint32_val = (1 << 31) - 1
        self.sensor.sint64_val = (1 << 63) - 1
        self.sensor.uint32_val = (1 << 32) - 1

        self.sensor.bytes_val = b'\xDE\xAD'
        self.sensor.double_array_val.extend([1 / 3, 32.21, 434324321])
        self.sensor.uint64_val = (1 << 64) - 1

        self.schema_registry_client = SchemaRegistryClient(
            {'url': driver.schemaRegistryAddress})
        self.keyProtobufSerializer = ProtobufSerializer(
            sensor_pb2.SensorReading, self.schema_registry_client)
        self.valueProtobufSerializer = ProtobufSerializer(
            sensor_pb2.SensorReading, self.schema_registry_client)
        producer_conf = {
            'bootstrap.servers': driver.kafkaAddress,
            'key.serializer': self.keyProtobufSerializer,
            'value.serializer': self.valueProtobufSerializer
        }

        self.protobufProducer = SerializingProducer(producer_conf)

    def getConfigFileName(self):
        return self.fileName + ".json"

    def send(self):
        for e in range(100):
            self.protobufProducer.produce(self.topic, self.sensor, self.sensor)
            self.protobufProducer.poll(0)
        self.protobufProducer.flush()

    def verify(self, round):
        res = self.driver.snowflake_conn.cursor().execute(
            "SELECT count(*) FROM {}".format(self.topic)).fetchone()[0]
        if res == 0:
            raise RetryableError()
        elif res != 100:
            raise NonRetryableError(
                "Number of records in table is different from number of records sent"
            )

        # validate content of line 1
        res = self.driver.snowflake_conn.cursor().execute(
            "Select * from {} limit 1".format(self.topic)).fetchone()

        # "schema_id" is lost since they are using native avro converter
        goldMeta = r'{"CreateTime":\d*,"key":{"bytes_val":"3q0=","dateTime":1234,"device":' \
                   r'{"deviceID":"555-4321","enabled":true},"double_array_val":' \
                   r'[0.3333333333333333,32.21,4.343243210000000e+08],"float_val":4321.432,' \
                   r'"int32_val":2147483647,"reading":321.321,"sint32_val":2147483647,"sint64_val":9223372036854775807,' \
                   r'"uint32_val":4294967295,"uint64_val":-1},"offset":\d*,"partition":\d*,"topic":"travis_correct_confluent_protobuf_protobuf....."}'
        goldContent = r'{"bytes_val":"3q0=","dateTime":1234,"device":{"deviceID":"555-4321","enabled":true},"double_array_val":' \
                      r'[0.3333333333333333,32.21,4.343243210000000e+08],"float_val":4321.432,"int32_val":2147483647,' \
                      r'"reading":321.321,"sint32_val":2147483647,"sint64_val":9223372036854775807,"uint32_val":4294967295,"uint64_val":-1}'
        self.driver.regexMatchOneLine(res, goldMeta, goldContent)

        self.driver.verifyStageIsCleaned(self.topic)

    def clean(self):
        self.driver.cleanTableStagePipe(self.topic)
Example #20
class KafkaLoggingHandler(logging.Handler):
    """
    This handler enables the user to forward logs to Kafka.

    Attributes:
        additional_fields (dict): extra fields attached to logs
        kafka_topic_name (str): topic name
        producer (confluent_kafka.SerializingProducer): producer object
    """

    __LOGGING_FILTER_FIELDS = [
        "msecs", "relativeCreated", "levelno", "created"
    ]

    def __init__(
        self,
        hosts_list,
        topic,
        security_protocol="SSL",
        ssl_cafile=None,
        extended_producer_config=None,
        additional_fields=None,
        log_preprocess=None,
        internal_logger_level="INFO",
        delivery_timeout=2,
    ):
        """
        Initialize the handler.

        Args:
            hosts_list: list of the Kafka hostnames
            topic: kafka consumer topic to where logs are forwarded
            security_protocol (str, optional): KafkaProducer security protocol
            ssl_cafile (None, optional): path to CA file
            extended_producer_config (None, optional):
                extra arguments to update confluent_kafka.SerializingProducer config
            additional_fields (None, optional):
                A dictionary with all the additional fields that you would like
                to add to the logs, such the application, environment, etc.
            log_preprocess (None/list, optional):
                list of functions, handler will send the following to Kafka
                ...preprocess[1](preprocess[0](raw_log))...
            internal_logger_level (str, optional):
                internal logger loglevel.
            delivery_timeout (int, optional):
                delivery timeout in seconds.

        Raises:
            KafkaLoggerException: in case of incorrect logger configuration

        """

        self._internal_logger = self._init_internal_logger(
            internal_logger_level)

        self.log_preprocess = log_preprocess or []

        self.additional_fields = additional_fields or {}
        self.additional_fields.update({
            "host":
            socket.gethostname(),
            "host_ip":
            socket.gethostbyname(socket.gethostname())
        })

        if security_protocol == "SSL" and ssl_cafile is None:
            raise KafkaLoggerException("SSL CA file isn't provided.")
        self.kafka_topic_name = topic
        self.delivery_timeout_sec = delivery_timeout
        extended_producer_config = extended_producer_config or {}
        producer_config = {
            "bootstrap.servers": hosts_list,
            "security.protocol": security_protocol,
            "ssl.ca.location": ssl_cafile,
            "key.serializer": StringSerializer("utf_8"),
            "value.serializer": lambda msg, _: json.dumps(msg).encode("utf-8"),
            "delivery.timeout.ms": self.delivery_timeout_sec * 1000,
            "error_cb": self.error_callback,
        }
        producer_config.update(extended_producer_config)

        self.producer = SerializingProducer(producer_config)

        logging.Handler.__init__(self)
        self._internal_logger.debug(
            f"KAFKA LOGGER INITIALIZED WITH CONFIG: {str(producer_config)}")

    @staticmethod
    def _init_internal_logger(level="INFO"):
        internal_handler = logging.StreamHandler(sys.stderr)
        internal_handler.setLevel(level)
        internal_handler.setFormatter(
            logging.Formatter(
                "[%(asctime)s] [%(process)s] [%(name)s] [%(levelname)s]: %(message)s"
            ))
        internal_logger = logging.getLogger("confluent_kafka_handler")
        internal_logger.addHandler(internal_handler)
        internal_logger.setLevel(level)
        internal_logger.propagate = False
        return internal_logger

    def prepare_record_dict(self, record):
        """
        Prepare a dictionary log item.

        Format a log record and extend dictionary with default values.

        Args:
            record (logging.LogRecord): log record

        Returns:
            dict: log item ready for Kafka
        """
        # use default formatting
        # Update the msg dict to include all of the message attributes
        self.format(record)

        # If there's an exception, let's convert it to a string
        if record.exc_info:
            record.msg = repr(record.msg)
            record.exc_info = repr(record.exc_info)

        # Append additional fields
        rec = self.additional_fields.copy()
        for key, value in record.__dict__.items():
            if key not in self.__LOGGING_FILTER_FIELDS:
                if key == "args":
                    # convert ALL argument to a str representation
                    # Elasticsearch supports number datatypes
                    # but it is not 1:1 - logging "inf" float
                    # causes _jsonparsefailure error in ELK
                    value = tuple(repr(arg) for arg in value)
                if key == "msg" and not isinstance(value, str):
                    # msg contains custom class object
                    # if there is no formatting in the logging call
                    value = str(value)
                rec[key] = "" if value is None else value
            if key == "created":
                # inspired by: cmanaha/python-elasticsearch-logger
                created_date = datetime.datetime.utcfromtimestamp(
                    record.created)
                rec["timestamp"] = "{!s}.{:03d}Z".format(
                    created_date.strftime("%Y-%m-%dT%H:%M:%S"),
                    int(created_date.microsecond / 1000))
        # apply preprocessor(s)
        for preprocessor in self.log_preprocess:
            rec = preprocessor(rec)

        return rec

    def emit(self, record):
        """
        Prepare and send LogRecord to kafka topic

        Args:
            record: Logging message
        """
        record_dict = self.prepare_record_dict(record)
        try:
            self.producer.produce(self.kafka_topic_name,
                                  value=record_dict,
                                  on_delivery=self.error_callback)
            self.producer.poll(0)
        except BufferError:
            self._internal_logger.error(
                "Confluent kafka queue is full, logs will be lost.")

    def error_callback(self, err, msg=None):
        if err:
            self._internal_logger.error(err)
        if msg:
            self._internal_logger.debug(msg)

    def flush(self):
        if hasattr(self, "producer"):
            self.producer.flush(self.delivery_timeout_sec + 0.1)
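A hedged usage sketch for the handler above; the broker addresses, topic, and certificate path are placeholders:

import logging

handler = KafkaLoggingHandler(
    hosts_list="kafka-1:9093,kafka-2:9093",
    topic="application-logs",
    ssl_cafile="/etc/ssl/certs/kafka-ca.pem",
    additional_fields={"application": "billing-service"},
)

logger = logging.getLogger("billing")
logger.setLevel(logging.INFO)
logger.addHandler(handler)
logger.info("order processed")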
Example #21
class DatahubKafkaSink(Sink):
    config: KafkaSinkConfig
    report: SinkReport

    def __init__(self, config: KafkaSinkConfig, ctx):
        super().__init__(ctx)
        self.config = config
        self.report = SinkReport()

        schema_registry_conf = {
            'url': self.config.connection.schema_registry_url,
            **self.config.connection.schema_registry_config,
        }
        schema_registry_client = SchemaRegistryClient(schema_registry_conf)

        def convert_mce_to_dict(mce: MetadataChangeEvent, ctx):
            tuple_encoding = mce.to_obj(tuples=True)
            return tuple_encoding

        avro_serializer = AvroSerializer(SCHEMA_JSON_STR,
                                         schema_registry_client,
                                         to_dict=convert_mce_to_dict)

        producer_config = {
            "bootstrap.servers": self.config.connection.bootstrap,
            'key.serializer': StringSerializer('utf_8'),
            'value.serializer': avro_serializer,
            **self.config.connection.producer_config,
        }

        self.producer = SerializingProducer(producer_config)

    @classmethod
    def create(cls, config_dict, ctx: PipelineContext):
        config = KafkaSinkConfig.parse_obj(config_dict)
        return cls(config, ctx)

    def handle_work_unit_start(self, workunit: WorkUnit) -> None:
        pass

    def handle_work_unit_end(self, workunit: WorkUnit) -> None:
        self.producer.flush()

    def write_record_async(
        self,
        record_envelope: RecordEnvelope[MetadataChangeEvent],
        write_callback: WriteCallback,
    ):
        # call poll to trigger any callbacks on success / failure of previous writes
        self.producer.poll(0)
        mce = record_envelope.record
        self.producer.produce(
            topic=self.config.topic,
            value=mce,
            on_delivery=KafkaCallback(self.report, record_envelope,
                                      write_callback).kafka_callback,
        )

    def get_report(self):
        return self.report

    def close(self):
        self.producer.flush()
Example #22
import meal_pb2
from confluent_kafka import SerializingProducer
from confluent_kafka.serialization import StringSerializer
from confluent_kafka.schema_registry import SchemaRegistryClient
from confluent_kafka.schema_registry.protobuf import ProtobufSerializer
from uuid import uuid4

topic = 'meal'
schema_registry_client = SchemaRegistryClient({'url': 'http://t620.lan:8081'})
protobuf_serializer = ProtobufSerializer(meal_pb2.Meal, schema_registry_client)

producer_conf = {
    'bootstrap.servers': 't620.lan:9092',
    'key.serializer': StringSerializer('utf_8'),
    'value.serializer': protobuf_serializer
}

producer = SerializingProducer(producer_conf)

producer.poll(0.0)

mybeer = meal_pb2.Meal.DrinkItems(drink_name="beer")
mywine = meal_pb2.Meal.DrinkItems(drink_name="wine")

meal = meal_pb2.Meal(name='pizza', drink=[mybeer, mywine])
# Smaller meal (for testing kafkajs, which seems to miss the drinks)
# meal = meal_pb2.Meal(name='pizza', drink=[])

producer.produce(topic=topic, key=str(uuid4()), value=meal)
producer.flush()
Example #23
class Broker:
    def __init__(self, consumer_topic, producer_topic, client_id,
                 bootstrap_servers, consumer_proto_class, producer_proto_class,
                 processor, max_thread_calls):
        self.consumer_topic = consumer_topic
        self.producer_topic = producer_topic
        self.client_id = client_id
        self.bootstrap_servers = bootstrap_servers
        self.consumer_proto_class = consumer_proto_class
        self.producer_proto_class = producer_proto_class
        self.processor = processor
        self.max_thread_calls = max_thread_calls

        self.kafka_consumer = DeserializingConsumer({
            'bootstrap.servers':
            self.bootstrap_servers,
            'group.id':
            self.client_id,
            'auto.offset.reset':
            "earliest",
            'value.deserializer':
            self.deserializer
        })
        self.kafka_consumer.subscribe([self.consumer_topic])

        self.kafka_producer = SerializingProducer({
            'bootstrap.servers':
            self.bootstrap_servers,
            'queue.buffering.max.messages':
            500000,
            'value.serializer':
            self.serialize
        })

        self.thread_queue = deque(maxlen=self.max_thread_calls)
        self.latest_thread_queue_id = 1

    def deserializer(self, bytes_message, _):
        message = image_pb2.ImageInfo()
        message.ParseFromString(bytes_message)
        return message

    def serialize(self, message, _):
        return message.SerializeToString()

    def get_thread_id(self):
        result = self.latest_thread_queue_id
        if result == self.max_thread_calls:
            self.latest_thread_queue_id = 1
        else:
            self.latest_thread_queue_id += 1
        return result

    def is_thread_queue_full(self):
        return len(self.thread_queue) == self.max_thread_calls

    def produce_when_ready(self, thread_id, message):
        while self.thread_queue[-1] != thread_id:
            logging.warning("Thread {} got stuck in queue".format(thread_id))
            # time.sleep(0.01)
        self.kafka_producer.poll(0.0)
        self.kafka_producer.produce(topic=self.producer_topic, value=message)
        self.thread_queue.pop()

    def call_processor(self, thread_id, value, start_time):
        result = self.processor.process(value)
        self.produce_when_ready(thread_id, result)
        logging.debug("Total time for thead" + str(thread_id) + " is " +
                      str(time.time() - start_time / 1000))

    def run(self):
        while True:
            try:
                if self.is_thread_queue_full():
                    logging.warning(
                        "Thread queue is full, waiting for previous threads to finished"
                    )
                    continue

                msg = self.kafka_consumer.poll(1.0)
                if msg is None or msg.value() is None:
                    logging.warning("No messages from kafka")
                    continue

                caller_thread_id = self.get_thread_id()
                caller_thread = threading.Thread(target=self.call_processor,
                                                 args=(caller_thread_id,
                                                       msg.value(),
                                                       msg.timestamp()[1]))
                self.thread_queue.appendleft(caller_thread_id)
                caller_thread.start()

            except KeyboardInterrupt:
                break

        self.kafka_consumer.close()
        self.kafka_producer.flush()
Example #24
from confluent_kafka import SerializingProducer
from confluent_kafka.serialization import IntegerSerializer, StringSerializer


def callback(err, msg):
    if err is not None:
        print(f'Message delivery failed: {err}')
    else:
        print(f'Message delivered to {msg.topic()} [{msg.partition()}]')


p = SerializingProducer({
    'bootstrap.servers': 'localhost:9092',
    'key.serializer': IntegerSerializer(),
    'value.serializer': StringSerializer()
})


for i in range(100):
    polling_result = p.poll(0)
    if polling_result:
        print(f'Polling result: {polling_result}')
    p.produce('sample-topic', key=i, value=f'hello world {i}', on_delivery=callback)

p.flush()

Example #25
    delivered_records = 0
    google_news = gs.google_top_results(num_articles,
                                        '/search?q=chicago&tbm=nws')
    print('loop should run {} times'.format(len(google_news)))
    print("for these url's", google_news)
    for num in range(len(google_news)):
        # print("begin producing record {}".format(num+1))
        url = google_news.iloc[num]
        # print("url key for message is", url)
        text = gs.html_to_string(url)
        # print("dirty text for message is", text[:20])
        news = gs.clean_news(text, 20)
        # print("cleaned news value for message is", news)
        scraper_dt = datetime.now(pytz.timezone('America/Denver'))
        scraper_dt = scraper_dt.strftime("%Y/%m/%d %H:%M:%S %z")
        value_obj = google.Value(text=news.to_string(index=False),
                                 scraper_dt=scraper_dt)
        key_obj = google.Key(url=str(url))
        print("Producing record: {}\t{}".format(key_obj.url,
                                                value_obj.text[:10]))
        producer.produce(topic=topic,
                         key=key_obj,
                         value=value_obj,
                         on_delivery=kafka_utils.acked)
        delivered_records += producer.poll()

    producer.flush()

    print("{} messages were produced to topic {}!".format(
        delivered_records, topic))