def test_select(cluster):
    # type: (ClickHouseCluster) -> None
    """Round-trip three Avro-encoded rows through the ``AvroConfluent`` format.

    Encodes the records ``{'value': 0}``, ``{'value': 1}`` and ``{'value': 2}``
    with a serializer built on the cluster's schema registry client, inserts
    the concatenated payload into a newly created ``Memory`` table on the
    ``dummy`` instance, and asserts that selecting from that table yields the
    three values in order.
    """
    encoder = MessageSerializer(cluster.schema_registry_client)
    record_schema = avro.schema.make_avsc_object({
        'name': 'test_record',
        'type': 'record',
        'fields': [{'name': 'value', 'type': 'long'}],
    })

    # All encoded messages are concatenated and sent in a single INSERT.
    payload = io.BytesIO()
    for value in range(0, 3):
        payload.write(
            encoder.encode_record_with_schema(
                'test_subject', record_schema, {'value': value}
            )
        )
    encoded_rows = payload.getvalue()

    node = cluster.instances["dummy"]  # type: ClickHouseInstance
    registry_url = "http://{}:{}".format(
        cluster.schema_registry_host, cluster.schema_registry_port
    )

    run_query(node, "create table avro_data(value Int64) engine = Memory()")
    run_query(
        node,
        "insert into avro_data format AvroConfluent",
        encoded_rows,
        {'format_avro_schema_registry_url': registry_url},
    )

    selected = run_query(node, "select * from avro_data")
    # One whitespace-split list per output line, i.e. one list per row.
    assert [str.split(row) for row in selected.splitlines()] == [
        ["0"],
        ["1"],
        ["2"],
    ]
class StdOutListener(tweepy.StreamListener):
    """Stream listener that builds a payload from each status and Avro-encodes it.

    For every status the listener calls ``createPayload`` to obtain an indexed
    sequence of fields (``mls``), prints some timing diagnostics, and then
    encodes the fields as a record using a schema registered with the schema
    registry. Failures in either stage are logged, printed and reported via
    ``twitter_utils.sendErrorMail``.
    """

    def on_status(self, message):
        """Handle one incoming status.

        Args:
            message: the status object handed over by the stream; it is
                passed straight to ``createPayload``.

        Returns:
            ``True`` from either error handler. Nothing is returned explicitly
            when both stages succeed.

        NOTE(review): the encoded message is not used by any code visible
        here, and ``topicname`` is not defined in this block; it must come
        from an enclosing scope.
        """
        logging.debug(datetime.now().strftime("%A, %d. %B %Y %I:%M%p"))

        try:
            mls = createPayload(message)
            print(mls)
            now = datetime.utcnow()
            created_at = datetime.strptime(mls[11], '%Y-%m-%d %H:%M:%S')
            timevar = now - created_at
            # Each print receives one pre-formatted argument so the output is
            # the same under the Python 2 print statement and the Python 3
            # print function.
            print("%s %s" % (now, created_at))
            print("%s %s" % (
                "Minutes and Seconds : ",
                divmod(timevar.days * 86400 + timevar.seconds, 60),
            ))
        except Exception as e:
            logging.debug(
                'There was an error in creating the payload. The error is: %s',
                e)
            print("%s %s" % ('Error in Payload Creation : ', str(e)))
            twitter_utils.sendErrorMail(
                'There was an error in creating the payload. The error is %s'
                % e)
            return True

        try:
            # converts the payload into the avro format in preparation for
            # loading into hbase
            # The context manager closes the schema file even if parsing fails.
            with open('/**/**/twitter.avsc') as schema_file:
                avro_schema = Util.parse_schema_from_string(schema_file.read())
            client = CachedSchemaRegistryClient(url='http://192.168.**:8081')
            schema_id = client.register('twitter_avro_schema_stream4',
                                        avro_schema)
            avro_schema = client.get_by_id(schema_id)
            schema_id, avro_schema, schema_version = client.get_latest_schema(
                'twitter_avro_schema_stream4')
            schema_version = client.get_version('twitter_avro_schema_stream4',
                                                avro_schema)
            serializer = MessageSerializer(client)
            encoded = serializer.encode_record_with_schema(
                topicname, avro_schema, {
                    "authid": mls[0],
                    "screen_name": mls[1],
                    "description": mls[2],
                    "favourites_count": convert_long(mls[3]),
                    "followers_count": convert_long(mls[4]),
                    "friends_count": convert_long(mls[5]),
                    "listed_count": convert_long(mls[6]),
                    "location": mls[7],
                    "id_str": mls[8],
                    "time_zone": mls[9],
                    "statuses_count": convert_long(mls[10]),
                    "created_at": mls[11],
                    "favorite_count": convert_long(mls[12]),
                    "tid": mls[13],
                    "in_reply_to_status_id_str": mls[14],
                    "in_reply_to_user_id_str": mls[15],
                    "lang": mls[16],
                    "possibly_sensitive": mls[17],
                    "retweet_count": convert_long(mls[18]),
                    "text": mls[19],
                    "entities_url": mls[20],
                    "entities_expanded_url": mls[21],
                    "entities_media_url": mls[22],
                    "disgust": convert_long(mls[23]),
                    "fear": convert_long(mls[24]),
                    "sadness": convert_long(mls[25]),
                    "surprise": convert_long(mls[26]),
                    "trust": convert_long(mls[27]),
                    "negative": convert_long(mls[28]),
                    "positive": convert_long(mls[29]),
                    "neutral": convert_long(mls[30]),
                    "celebrities": mls[31],
                    "events": mls[32],
                    "brands": mls[33],
                    "accessories": mls[34],
                })
        except Exception as e:
            logging.debug(
                'There was an error in the generation of the avro file. '
                'The error is: %s',
                e)
            print("%s %s" % ('Error in avro generation : ', e))
            print(mls)
            twitter_utils.sendErrorMail(
                'There was an error in the generation of the avro file. '
                'The error is %s. This is likely due to an error in the '
                'schema. Please check the schema file under '
                'twitter_avro_schema.avsc' % e)
            # NOTE(review): kept inside the handler to mirror the payload
            # handler above; confirm this matches the intended placement.
            return True
def setUp(self):
    """Create the mock schema registry client and the serializer under test.

    Stores the mock client on ``self.client`` and a ``MessageSerializer``
    built on that same client on ``self.ms``.
    """
    registry = MockSchemaRegistryClient()
    self.client = registry
    # The serializer shares the mock client stored on the test case.
    self.ms = MessageSerializer(registry)
def get_message_serializer(self):
    """Build a ``MessageSerializer`` for this object's schema registry.

    The registry URL is taken from ``self.get_schema_registry_url()`` and
    logged at debug level before the client is created.

    Returns:
        A ``MessageSerializer`` wrapping a new ``CachedSchemaRegistryClient``
        that points at that URL.
    """
    registry_url = self.get_schema_registry_url()
    logger.debug('loading schema registry: ' + registry_url)
    return MessageSerializer(CachedSchemaRegistryClient(url=registry_url))
from pyspark import SparkContext from pyspark.streaming import StreamingContext from pyspark.streaming.kafka import KafkaUtils from sys import argv from config import KAFKA_URL, KAFKA_BROKER_LIST topic = argv[1] #enter topic as parameter when running script schema_registry_url = argv[ 2] # enter schema registry url (ex. http://localhost:8081) if len(argv) > 3 and argv[3] == 'reset': auto_offset_reset = 'smallest' else: auto_offset_reset = 'largest' schema_registry_client = CachedSchemaRegistryClient(url=schema_registry_url) serializer = MessageSerializer(schema_registry_client) # simple decode to replace Kafka-streaming's built-in decode decoding UTF8 () def decoder(s): decoded_message = serializer.decode_message(s) return decoded_message # Spark Streaming from Kafka master = 'local[2]' app_name = 'kafka_consumer' sc = SparkContext(master, app_name) ssc = StreamingContext(sc, 60) kvs = KafkaUtils.createDirectStream(ssc, [topic], { "metadata.broker.list": KAFKA_BROKER_LIST,
def writeToavro(p, mls):
    """Encode one payload as an Avro record using the registered schema.

    Reads the schema file, registers it with the schema registry under the
    subject ``twitter_avro__schema_stream4``, re-fetches the latest schema for
    that subject, and encodes the fields of ``mls`` as a record. Any failure
    is logged, printed and reported via ``twitter_utils.sendErrorMail``.

    Args:
        p: not used by any code visible in this function.
        mls: indexed sequence of at least 35 payload fields (indices 0-34
            are read).

    Returns:
        ``True`` from the error handler. Nothing is returned explicitly on
        success.

    NOTE(review): the encoded message is not used by any code visible here,
    and ``topicname`` is not a parameter or local; it must come from an
    enclosing scope.
    """
    # converts the payload into the avro format in preparation for loading
    # into hbase
    try:
        # The context manager closes the schema file even if parsing fails.
        with open('/root/quest/twitter_avro_schema.avsc') as schema_file:
            avro_schema = Util.parse_schema_from_string(schema_file.read())
        client = CachedSchemaRegistryClient(url='http://192.168.111.12:8081')
        schema_id = client.register('twitter_avro__schema_stream4',
                                    avro_schema)
        avro_schema = client.get_by_id(schema_id)
        schema_id, avro_schema, schema_version = client.get_latest_schema(
            'twitter_avro__schema_stream4')
        schema_version = client.get_version('twitter_avro__schema_stream4',
                                            avro_schema)
        serializer = MessageSerializer(client)
        encoded = serializer.encode_record_with_schema(
            topicname, avro_schema, {
                "authid": mls[0],
                "screen_name": mls[1],
                "description": mls[2],
                "favourites_count": convert_long(mls[3]),
                "followers_count": convert_long(mls[4]),
                "friends_count": convert_long(mls[5]),
                "listed_count": convert_long(mls[6]),
                "location": mls[7],
                "id_str": mls[8],
                "time_zone": mls[9],
                "statuses_count": convert_long(mls[10]),
                "created_at": mls[11],
                "favorite_count": convert_long(mls[12]),
                "tid": mls[13],
                "in_reply_to_status_id_str": mls[14],
                "in_reply_to_user_id_str": mls[15],
                "lang": mls[16],
                "possibly_sensitive": mls[17],
                "retweet_count": convert_long(mls[18]),
                "text": mls[19],
                "entities_url": mls[20],
                "entities_expanded_url": mls[21],
                "entities_media_url": mls[22],
                "disgust": convert_long(mls[23]),
                "fear": convert_long(mls[24]),
                "sadness": convert_long(mls[25]),
                "surprise": convert_long(mls[26]),
                "trust": convert_long(mls[27]),
                "negative": convert_long(mls[28]),
                "positive": convert_long(mls[29]),
                "neutral": convert_long(mls[30]),
                "celebrities": mls[31],
                "events": mls[32],
                "brands": mls[33],
                "accessories": mls[34],
            })
    except Exception as e:
        logging.debug(
            'There was an error in the generation of the avro file. '
            'The error is: %s',
            e)
        # Each print receives one pre-formatted argument so the output is the
        # same under the Python 2 print statement and the Python 3 print
        # function.
        print("%s %s" % ('Error in avro generation : ', e))
        print(mls)
        twitter_utils.sendErrorMail(
            'There was an error in the generation of the avro file. '
            'The error is %s' % e)
        # NOTE(review): kept inside the handler; confirm this matches the
        # intended placement of the return.
        return True