def main(): "main foo happening here, alright!" usage = "%prog [Options]" parser = OptionParser(usage=usage, version="%prog v0.1") parser.add_option("-f", "--file", dest="avro", help="an Avro file to read from", metavar="some/file.avro") parser.add_option("-s", "--schema", dest="schema", default=None, action="store_true", help="only extract and return the avro-schema in JSON") parser.add_option("-i", "--pretty", dest="pretty", default=None, action="store_true", help="indent [and keysort, if schema] any JSON on the output") parser.add_option("-n", "--number", dest="num", type="int", default=5, help="integer number of lines to put out") parser.add_option("-d", "--destination", dest="dest", help="optional destination file to write to") (opts, arg) = parser.parse_args() if not opts.avro: print "We at least need a file, ok?" parser.print_help() sys.exit(2) if not opts.dest and opts.schema: schema = get_schema(opts.avro) if not opts.pretty: print schema.to_json() else: print json.dumps(schema.to_json(), sort_keys=True, indent=opts.pretty) # I know, it's silly elif not opts.dest and not opts.schema: print json.dumps(head_avro(opts.avro, opts.num), indent=opts.pretty) else: write_avro(opts.dest, opts.num, opts.avro)
def combine_schemas(schema_files):
    """Combine multiple nested schemas into a single schema."""
    known_schemas = avro.schema.Names()
    for s in schema_files:
        schema = load_single_avsc(s, known_schemas)
    return schema.to_json()
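# A hedged sketch of the load_single_avsc() helper referenced above (it is not
# shown in the original). Each .avsc file is parsed against the shared Names
# registry, so later schemas can refer to types defined in earlier ones.
# It assumes the avro-python3 API; some avro releases expose the same
# functionality as avro.schema.make_avsc_object(json_data, names) instead.
import json

import avro.schema


def load_single_avsc(file_path, names):
    with open(file_path) as f:
        json_data = json.load(f)
    # Parse the schema and register it in `names` for later references.
    return avro.schema.SchemaFromJSONData(json_data, names)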
def main():
    # Create Avro schema
    test_schema = '''
    {
        "namespace": "example.avro",
        "type": "record",
        "name": "SampleLog",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "value", "type": "float"},
            {"name": "time", "type": "float"},
            {"name": "datetime", "type": "string"}
        ]
    }
    '''
    schema = avro.schema.parse(test_schema)
    writer = avro.io.DatumWriter(schema)

    # Create producer
    conf = {'bootstrap.servers': 'localhost'}
    p = Producer(**conf)

    print 'Schema:'
    schema_string = json.dumps(schema.to_json()).encode('utf-8')
    schema_string = "".join(schema_string.split())
    print schema_string

    # Topic
    topic_name = 'demo_9'

    # Register schema
    schema_reg_url = 'http://localhost:8081'
    r = requests.post(
        schema_reg_url + '/subjects/' + topic_name + '-value/versions',
        data=json.dumps({'schema': schema_string}).encode('utf-8'),
        headers={'Content-Type': 'application/vnd.schemaregistry.v1+json'})
    schema_id = json.loads(r.text)["id"]
    print 'Schema id: ' + str(schema_id)

    # Create devices
    devices = [device.SampleTemperature(), device.Oscillator()]

    # Time step loop
    timestep = 1  # seconds
    current_time = 0
    while True:
        time.sleep(timestep)
        current_time += timestep
        print current_time
        for dev in devices:
            p.produce(topic_name, value=create_avro_message(dev, writer, schema_id))
            dev.update(timestep)
        p.flush()
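# A hedged sketch of create_avro_message(), which the loop above calls but which
# is not shown here. It assumes the Confluent wire format (a zero magic byte,
# the 4-byte big-endian schema id, then the Avro-binary-encoded record) and
# assumes each device exposes a read() method returning a dict matching SampleLog.
import io
import struct

import avro.io


def create_avro_message(dev, writer, schema_id):
    buf = io.BytesIO()
    buf.write(struct.pack('>bI', 0, schema_id))  # magic byte + schema id
    encoder = avro.io.BinaryEncoder(buf)
    writer.write(dev.read(), encoder)            # datum must match the registered schema
    return buf.getvalue()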
def combineSchemas(schema_files):
    """Combine multiple nested schemas into a single schema.

    Parameters
    ----------
    schema_files : `list`
        List of files containing schemas.
        If nested, the innermost schema must come first.

    Returns
    -------
    `dict`
        Avro schema
    """
    known_schemas = avro.schema.Names()
    for s in schema_files:
        schema = _loadSingleAvsc(s, known_schemas)
    return schema.to_json()
def get_schema(self, schema_dumper):
    data = schema_dumper.stdout.getvalue()
    schema = avro.schema.parse(data)
    return schema.to_json()
outdir = "avro-data" if not os.path.exists(outdir): os.makedirs(outdir) with open(f"avro/{document}.schema.json", "r") as f: schema_data = f.read() schema = avro.schema.Parse(schema_data) outfile = open(f"{outdir}/{document}.avro", "wb") writer = avro.datafile.DataFileWriter(outfile, avro.io.DatumWriter(), schema) with open(f"data/{document}.ndjson", "r") as f: data = f.readlines() try: orig = None for line in data: orig = json.loads(line) out = convert(orig, schema) writer.append(out) except: with open("test.json", "w") as f: json.dump(orig, f) with open("test-schema.json", "w") as f: json.dump(schema.to_json(), f, indent=2) validation.validate(out, parse_schema(schema.to_json())) writer.close()
import io

import avro.schema
import avro.io
from avro.io import DatumWriter
from kafka import SimpleProducer, KafkaProducer
from kafka import KafkaClient
from time import time

# To send messages synchronously
producer = KafkaProducer(bootstrap_servers="localhost:9092",
                         compression_type="gzip")

# Kafka topic
topic = "tnx"

# Path to the transactions.avsc Avro schema
schema_path = "/home/cloudera/workspace/kafka-clients-python/transactions.avsc"
schema = avro.schema.Parse(open(schema_path).read())
print("Schema", schema.to_json())

writer = DatumWriter(schema)
bytes_writer = io.BytesIO()
encoder = avro.io.BinaryEncoder(bytes_writer)


def get_record():
    return {"id": "123",
            "merchant_id": "m123",
            "customer_id": "c345",
            "amount": 100.1,
            "category": "pos",
            "timestamp": int(time())}


for i in range(10):
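    # A likely loop body (an assumption, the original snippet stops at the for
    # line): encode one record with the Avro BinaryEncoder and send the raw
    # bytes to Kafka.
    writer.write(get_record(), encoder)
    raw_bytes = bytes_writer.getvalue()
    producer.send(topic, raw_bytes)
    # Reset the buffer so each message carries exactly one record.
    bytes_writer.seek(0)
    bytes_writer.truncate()

producer.flush()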
import io

import avro.schema
import avro.io
from avro.io import DatumWriter
from kafka import SimpleProducer, KafkaProducer
from kafka import KafkaClient
from time import time

# To send messages synchronously
producer = KafkaProducer(bootstrap_servers="localhost:9092",
                         compression_type="gzip")

# Kafka topic
topic = "tnx"

# Path to the transactions.avsc Avro schema
schema_path = "/home/cloudera/workspace/kafka-clients-python/transactions.avsc"
schema = avro.schema.Parse(open(schema_path).read())
print("Schema", schema.to_json())

writer = DatumWriter(schema)
bytes_writer = io.BytesIO()
encoder = avro.io.BinaryEncoder(bytes_writer)


def get_record():
    return {
        "id": "123",
        "merchant_id": "m123",
        "customer_id": "c345",
        "amount": 100.1,
        "category": "pos",
        "timestamp": int(time())
    }