Code Example #1
def produce(config, topic, input_messages):
    """
        produce initiate sending a message to Kafka, call the produce method passing in the input_messages key/value
        and and callback
    Parameters
    ----------
        topic: str
            topic where the input message publish too
        input_messages: dict
            a key/value input messages
        config: dict
            the config values that needed by the produce

     """
    if topic is None:
        logger.debug('Required topic field must be set')
        raise ValueError('Required topic field must be set')

    if not input_messages:
        logger.debug('Required data field must not be empty.')
        raise ValueError('Required data field must not be empty.')

    bootstrap_servers, schema_registry = producer_config(config)

    producer = Producer(bootstrap_servers)
    admin_client = AdminClient(bootstrap_servers)
    topics = admin_client.list_topics().topics
    # just to show what's available
    print(topics)

    if not topics:
        print('No topics found')
        raise RuntimeError('No topics found in the cluster')

    sr = CachedSchemaRegistryClient(schema_registry)
    ser = MessageSerializer(sr)
    # look up the latest registered schema for the topic's value subject
    schema_id, schema, version = sr.get_latest_schema(topic + "-value")
    if schema:
        for key, value in input_messages.items():
            if validate_uuid4(key):
                serialized_message = ser.encode_record_with_schema(
                    topic, schema, value)
                producer.produce(topic=topic,
                                 key=key,
                                 value=serialized_message,
                                 callback=acked)
                # flushing here per message would limit throughput to the broker round-trip time
                producer.poll(1)
            else:
                logger.error('Invalid UUID string: %s', key)
        # deliver anything still queued before returning
        producer.flush()
    else:
        print('Schema not found for topic name: ', topic)
        sys.exit(1)
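
The snippet above relies on names that are not defined in the excerpt: logger, producer_config, validate_uuid4, and acked. A minimal sketch of what they could look like, assuming config carries 'bootstrap.servers' and 'schema.registry.url' keys and that acked is a standard confluent_kafka delivery callback; these definitions are reconstructions for illustration, not the original module's code.

import logging
import uuid

logger = logging.getLogger(__name__)  # module-level logger assumed by produce()


def producer_config(config):
    # assumption: split the raw config into librdkafka producer settings and the registry URL
    bootstrap_servers = {'bootstrap.servers': config['bootstrap.servers']}
    schema_registry = config['schema.registry.url']
    return bootstrap_servers, schema_registry


def validate_uuid4(uuid_string):
    # one common recipe: the key must parse as a UUID and round-trip unchanged
    try:
        value = uuid.UUID(uuid_string, version=4)
    except (ValueError, AttributeError, TypeError):
        return False
    return str(value) == uuid_string.lower()


def acked(err, msg):
    # standard confluent_kafka delivery report callback, invoked from producer.poll()/flush()
    if err is not None:
        logger.error('Delivery failed for key %s: %s', msg.key(), err)
    else:
        logger.debug('Delivered to %s [%d] at offset %d',
                     msg.topic(), msg.partition(), msg.offset())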
Code Example #2
class AvroMessageSerializer(object):

    def __init__(self, schema_subject, schema_registry_url):
        ''' Create a new serializer object, which includes the remote-loaded
        schema object specified by schema_subject.

        Note this constructor is not exception safe
        '''

        self.schema_subject = schema_subject
        self.schema_registry_url = schema_registry_url
        self.schema_registry_client = CachedSchemaRegistryClient(url=self.schema_registry_url)

        self._load_schema()

        self.writer = SchemalessAvroRecordWriter(self.avro_schema)

    def _load_schema(self):
        try:
            schema_tuple = self.schema_registry_client.get_latest_schema(subject=self.schema_subject)
        except ValueError as e:
            raise ValueError('Schema subject ' + self.schema_subject + ' not found') from e

        if not schema_tuple[1]:
            raise ValueError('Schema subject ' + self.schema_subject + ' not found')

        self.schema_id = schema_tuple[0]
        self.avro_schema = schema_tuple[1].to_json()
        self.schema_version = schema_tuple[2]

    def kafka_avro_encode(self, record):
        with ContextBytesIO() as buf:
            # write the header
            # magic byte
            buf.write(struct.pack('b', _MAGIC_BYTE))
            # write the schema ID in network byte order (big-endian)
            buf.write(struct.pack('>I', self.schema_id))
            self.writer.write(buf, record)
            return buf.getvalue()
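
AvroMessageSerializer depends on a few names the excerpt does not define: _MAGIC_BYTE, ContextBytesIO, and SchemalessAvroRecordWriter. Below is a hedged reconstruction, assuming the Confluent wire-format magic byte 0 and fastavro's schemaless writer for the record body; none of these definitions come from the original module.

import io
import json
import struct  # used by kafka_avro_encode above

from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient  # used by __init__ above
from fastavro import parse_schema, schemaless_writer

# first byte of every Confluent wire-format message
_MAGIC_BYTE = 0

# io.BytesIO already supports the with-statement, so a plain alias is enough here
ContextBytesIO = io.BytesIO


class SchemalessAvroRecordWriter(object):
    """Write Avro record bodies without the Avro container-file framing."""

    def __init__(self, avro_schema):
        # avro_schema may arrive as a JSON string or an already-parsed dict (see _load_schema)
        if isinstance(avro_schema, str):
            avro_schema = json.loads(avro_schema)
        self.parsed_schema = parse_schema(avro_schema)

    def write(self, buf, record):
        schemaless_writer(buf, self.parsed_schema, record)


# illustrative usage (subject name and record fields are assumptions):
# serializer = AvroMessageSerializer('my-topic-value', 'http://localhost:8081')
# payload = serializer.kafka_avro_encode({'id': 1, 'name': 'example'})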
Code Example #3
File: kafka_etl.py  Project: yhjyoon/datahub
    record_schema = avro.load(AVROLOADPATH)
    producer = AvroProducer(conf, default_value_schema=record_schema)

    try:
        producer.produce(topic=KAFKATOPIC, value=mce)
        producer.poll(0)
        sys.stdout.write('\n%s has been successfully produced!\n' % mce)
    except ValueError as e:
        sys.stdout.write('Message serialization failed: %s\n' % e)
    producer.flush()


zk = KazooClient(ZOOKEEPER)
zk.start()
client = CachedSchemaRegistryClient(SCHEMAREGISTRY)

topics = zk.get_children("/brokers/topics")

for dataset_name in topics:
    if dataset_name.startswith('_'):
        continue
    topic = dataset_name + '-value'
    schema_id, schema, schema_version = client.get_latest_schema(topic)
    if schema_id is None:
        print(f"Skipping topic without schema: {topic}")
        continue

    print(topic)
    build_kafka_dataset_mce(dataset_name, str(schema), int(schema_version))

sys.exit(0)
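
The kafka_etl.py excerpt assumes a producer conf dict and several module-level constants that are defined elsewhere in the file. Placeholder values, purely for illustration (not the project's real settings):

AVROLOADPATH = './MetadataChangeEvent.avsc'   # path to the MCE Avro schema file (assumed)
KAFKATOPIC = 'MetadataChangeEvent'            # destination topic for the MCEs (assumed)
SCHEMAREGISTRY = 'http://localhost:8081'
ZOOKEEPER = 'localhost:2181'

conf = {
    'bootstrap.servers': 'localhost:9092',
    'schema.registry.url': SCHEMAREGISTRY,    # AvroProducer needs the registry URL in its config
}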
Code Example #4
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import BinaryType
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient


@udf(BinaryType())
def find_header_value(array, headerName):
    # return the value of the first Kafka header whose key matches headerName
    return next(x for x in array if x['key'] == headerName)['value']


schema_registry_client = CachedSchemaRegistryClient({
    "url": "https://psrc-4j1d2.westus2.azure.confluent.cloud",
    "basic.auth.credentials.source": "USER_INFO",
    "basic.auth.user.info": "2BEQE2KDNBJGDH2Y:8nixndjUyjXqTJoXnm3X3GwLZPz5F8umq74/g9ioG2mIi4lm0CWF1nUAf8deIFbP"
})

latest_id, latest_schema, latest_version = schema_registry_client.get_latest_schema(
    "transmissao-efetuada-value")

spark = SparkSession \
    .builder \
    .appName("CapturarEventosJob") \
    .master("local[*]") \
    .getOrCreate()

spark.sparkContext.setLogLevel('WARN')

spark.udf.register("find_header_value", find_header_value)

raw_data = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "pkc-epwny.eastus.azure.confluent.cloud:9092") \
Code Example #5
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col, expr, struct, lit, concat, array, date_format, current_timestamp
from pyspark.sql.avro.functions import to_avro
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient

schema_registry_client = CachedSchemaRegistryClient({
    "url": "https://psrc-4j1d2.westus2.azure.confluent.cloud",
    "basic.auth.credentials.source": "USER_INFO",
    "basic.auth.user.info": "2BEQE2KDNBJGDH2Y:8nixndjUyjXqTJoXnm3X3GwLZPz5F8umq74/g9ioG2mIi4lm0CWF1nUAf8deIFbP"
})

latest_id, latest_schema, latest_version = schema_registry_client.get_latest_schema(
    "relatorio-transmissao-value")

# Confluent wire-format header parts: magic byte 0x0, then the 4-byte schema ID in big-endian order
magic_byte = bytes([0x0])

id_bytes = (latest_id).to_bytes(4, byteorder='big')

spark = SparkSession \
    .builder \
    .appName("GerarRelatorioTransmissaoJob") \
    .master("local[*]") \
    .getOrCreate()

sqlContext = SQLContext(spark.sparkContext)

spark.sparkContext.setLogLevel('WARN')
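
A hedged sketch, not part of the original job, of one common way the magic_byte and id_bytes header built above is combined with to_avro so the Kafka payload matches the Confluent wire format. The DataFrame df, its columns, and the destination topic are assumptions; the struct passed to to_avro has to match the registered value schema.

# df is assumed to exist with columns matching the relatorio-transmissao-value schema
wire_format_value = concat(
    lit(magic_byte),                               # magic byte 0x0
    lit(id_bytes),                                 # 4-byte schema ID, big-endian
    to_avro(struct([col(c) for c in df.columns]),  # Avro-encoded record body
            str(latest_schema)))

df.select(wire_format_value.alias("value")) \
    .write \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "pkc-epwny.eastus.azure.confluent.cloud:9092") \
    .option("topic", "relatorio-transmissao") \
    .save()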
Code Example #6
    config = json.load(config_file)

Stats = namedtuple('Stats', [
    'time', 'ifcb_id', 'roi', 'name', 'classifier', 'prob',
    'classification_time', 'biovolume', 'carbon', 'hab'
])
ClassifierStats = namedtuple(
    'ClassifierStats',
    ['sample_name', 'prob', 'classifier', 'classification_time'])

schema_config = {'url': config['schema.registry.url'], 'ssl.ca.location': None}
# need to use CachedSchemaRegistryClient to get schema
# - need to copy config because it is consumed when used in CachedSchemaRegistryClient
schema_config_copy = schema_config.copy()
cached_schema_client = CachedSchemaRegistryClient(schema_config)
key_schema = str(cached_schema_client.get_latest_schema('ifcb-stats-key')[1])
value_schema = str(
    cached_schema_client.get_latest_schema('ifcb-stats-value')[1])

key_schema = avro.loads(key_schema)
value_schema = avro.loads(value_schema)
producer = AvroProducer(
    {
        'bootstrap.servers': config['bootstrap.servers'],
        'schema.registry.url': config['schema.registry.url']
    },
    default_key_schema=key_schema,
    default_value_schema=value_schema)

app = faust.App(config['app_name'],
                broker=config['broker'],
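
The excerpt is cut off above. As a hedged illustration only, here is one way the AvroProducer configured earlier could be used to publish a record built from the Stats namedtuple; the topic name, the key layout, and every field value are assumptions, not taken from the original application.

stats = Stats(time='2021-01-01T00:00:00Z', ifcb_id='IFCB104', roi=1, name='Dinophysis',
              classifier='cnn-v1', prob=0.93, classification_time=0.05,
              biovolume=1234.5, carbon=0.7, hab=True)

producer.produce(topic='ifcb-stats',              # assumed topic matching the *-key/*-value subjects
                 key={'ifcb_id': stats.ifcb_id},  # assumed key layout
                 value=stats._asdict())           # namedtuple -> dict for Avro encoding
producer.flush()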
Code Example #7
import json
import logging
import os

from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient
from confluent_kafka.avro.serializer.message_serializer import MessageSerializer

config = {
    'bootstrap.servers': 'localhost:9092',
    'client.id': 'phoenix-local-producer'
}

sr_config = {'url': 'http://localhost:8081', 'auto.register.schemas': False}

topic = 'avro-topic-1'
suffix = '-value'
subject = topic + suffix

sr_client = CachedSchemaRegistryClient(sr_config)
schema_details = sr_client.get_latest_schema(subject)
RECORD_SCHEMA = schema_details[1]

serializer = MessageSerializer(sr_client)


class AvroModel:
    def __init__(self, id, name):
        self.id = id
        self.name = name


data = AvroModel(None, None)

logging.getLogger().setLevel(logging.INFO)
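
The excerpt stops before anything is produced. A minimal sketch, assuming the registered schema carries the same id/name fields as AvroModel, of how the serializer and producer config above would typically be put to use; the Producer import, the record values, and the final log line are additions for illustration.

from confluent_kafka import Producer

producer = Producer(config)

record = AvroModel(1, 'phoenix-local-test')
# encode_record_with_schema prepends the Confluent wire-format header (magic byte + schema ID)
payload = serializer.encode_record_with_schema(topic, RECORD_SCHEMA, vars(record))

producer.produce(topic=topic, value=payload)
producer.flush()
logging.info('produced one record to %s', topic)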