from fastavro.schema import expand_schema, load_schema, load_schema_ordered


def typed_dict_from_schema_file(schema_path, referenced_schema_files=None):
    if referenced_schema_files:
        # the top-level schema goes last so its named references resolve first
        referenced_schema_files.append(schema_path)
        schema = load_schema_ordered(referenced_schema_files)
    else:
        schema = expand_schema(load_schema(schema_path))
    return types_for_schema(schema)
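A hedged usage sketch: the schema paths below are hypothetical, and load_schema_ordered expects dependencies to be listed before the schemas that reference them, which is why the function appends schema_path last.

# hypothetical files: Child.avsc defines a record that Parent.avsc references
types = typed_dict_from_schema_file(
    "schemas/Parent.avsc",
    referenced_schema_files=["schemas/Child.avsc"],
)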
import io

from fastavro import writer
from fastavro.schema import load_schema


def convert_to_avro(data: dict, compress_type: str, avro_schema: str) -> memoryview:
    fo = io.BytesIO()
    parsed_schema = load_schema(avro_schema)
    # fastavro.writer expects an iterable of records, so wrap the single dict
    writer(fo, parsed_schema, [data], codec=compress_type)
    fo.seek(0)
    return fo.getbuffer()
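For reference, a minimal round-trip sketch, assuming a small one-field schema named user.avsc written on the fly; fastavro's reader picks the codec up from the container header, so nothing extra is needed to decompress.

import io
import json

from fastavro import reader

# hypothetical one-field schema, written out so load_schema can find it
with open("user.avsc", "w") as f:
    json.dump({"type": "record", "name": "User",
               "fields": [{"name": "name", "type": "string"}]}, f)

buf = convert_to_avro({"name": "alice"}, "deflate", "user.avsc")
for record in reader(io.BytesIO(buf)):
    print(record)  # {'name': 'alice'}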
def custom_schema_test(self):

    def encode_and_decode(schema_definition):
        avro_schema = AvroSchema(None, schema_definition=schema_definition)
        company = {
            "name": "company-name",
            "address": 'xxx road xxx street',
            "employees": [
                {"name": "user1", "age": 25},
                {"name": "user2", "age": 30},
                {"name": "user3", "age": 35},
            ],
            "labels": {
                "industry": "software",
                "scale": ">100",
                "funds": "1000000.0"
            },
            "companyType": "companyType1"
        }
        data = avro_schema.encode(company)
        company_decode = avro_schema.decode(data)
        self.assertEqual(company, company_decode)

    schema_definition = {
        'doc': 'this is doc',
        'namespace': 'example.avro',
        'type': 'record',
        'name': 'Company',
        'fields': [
            {'name': 'name', 'type': ['null', 'string']},
            {'name': 'address', 'type': ['null', 'string']},
            {'name': 'employees', 'type': ['null', {
                'type': 'array',
                'items': {
                    'type': 'record',
                    'name': 'Employee',
                    'fields': [
                        {'name': 'name', 'type': ['null', 'string']},
                        {'name': 'age', 'type': ['null', 'int']}
                    ]
                }
            }]},
            {'name': 'labels', 'type': ['null', {'type': 'map', 'values': 'string'}]},
            {'name': 'companyType', 'type': ['null', {
                'type': 'enum',
                'name': 'CompanyType',
                'symbols': ['companyType1', 'companyType2', 'companyType3']
            }]}
        ]
    }
    encode_and_decode(schema_definition)
    # Users can load a schema from a file with `fastavro.schema.load_schema`,
    # or use `avro.schema` like this:
    # `avro.schema.parse(open("examples/company.avsc", "rb").read()).to_json()`
    encode_and_decode(load_schema("examples/company.avsc"))
def read_schemas_and_generate_classes(
    schema_files: Dict[str, List[str]],
    output_type: OutputType,
    *,
    run_black: bool = True,
    namespace_prefix: str = "",
    output_dir: str = ".",
) -> None:
    for name, files in schema_files.items():
        print(f"Parsing schema/s for {name}")
        if len(files) == 1:
            schema = load_schema(files[0])
        else:
            schema = load_schema_ordered(files)
        print(f"Generate {output_type.value}s...")
        generate_classes(
            schema,
            output_type,
            run_black=run_black,
            namespace_prefix=namespace_prefix,
            output_dir=output_dir,
        )
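A hedged invocation sketch; the file paths and the OutputType member are placeholders, since the enum's actual members live elsewhere in the project. When a name maps to several files, they must be ordered dependencies-first for load_schema_ordered.

# hypothetical paths and enum member, for illustration only
read_schemas_and_generate_classes(
    {"company": ["schemas/Employee.avsc", "schemas/Company.avsc"]},
    OutputType.TYPED_DICT,  # assumed member name
    output_dir="generated",
)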
def custom_schema_produce_and_consume_test(self):
    client = pulsar.Client(self.serviceUrl)

    def produce_and_consume(topic, schema_definition):
        print('custom schema produce and consume test topic - ', topic)
        example_avro_schema = AvroSchema(None, schema_definition=schema_definition)
        producer = client.create_producer(topic=topic, schema=example_avro_schema)
        consumer = client.subscribe(topic, 'test', schema=example_avro_schema)

        for i in range(0, 10):
            company = {
                "name": "company-name" + str(i),
                "address": 'xxx road xxx street ' + str(i),
                "employees": [
                    {"name": "user" + str(i), "age": 20 + i},
                    {"name": "user" + str(i), "age": 30 + i},
                    {"name": "user" + str(i), "age": 35 + i},
                ],
                "labels": {
                    "industry": "software" + str(i),
                    "scale": ">100",
                    "funds": "1000000.0"
                },
                "companyType": "companyType" + str((i % 3) + 1)
            }
            producer.send(company)

        for i in range(0, 10):
            msg = consumer.receive()
            # the schema has no companyType field, so the decoded value omits it
            company = {
                "name": "company-name" + str(i),
                "address": 'xxx road xxx street ' + str(i),
                "employees": [
                    {"name": "user" + str(i), "age": 20 + i},
                    {"name": "user" + str(i), "age": 30 + i},
                    {"name": "user" + str(i), "age": 35 + i},
                ],
                "labels": {
                    "industry": "software" + str(i),
                    "scale": ">100",
                    "funds": "1000000.0"
                }
            }
            self.assertEqual(msg.value(), company)
            consumer.acknowledge(msg)

        consumer.close()
        producer.close()

    schema_definition = {
        'doc': 'this is doc',
        'namespace': 'example.avro',
        'type': 'record',
        'name': 'Company',
        'fields': [
            {'name': 'name', 'type': ['null', 'string']},
            {'name': 'address', 'type': ['null', 'string']},
            {'name': 'employees', 'type': ['null', {
                'type': 'array',
                'items': {
                    'type': 'record',
                    'name': 'Employee',
                    'fields': [
                        {'name': 'name', 'type': ['null', 'string']},
                        {'name': 'age', 'type': ['null', 'int']}
                    ]
                }
            }]},
            {'name': 'labels', 'type': ['null', {'type': 'map', 'values': 'string'}]}
        ]
    }
    produce_and_consume('custom-schema-test-1', schema_definition=schema_definition)
    produce_and_consume('custom-schema-test-2',
                        schema_definition=load_schema("examples/company.avsc"))

    client.close()
pfb_schema = None
with open("pfb.Entity.avsc", "r") as Entity:
    # load the Entity schema as plain JSON
    pfb_schema_json = json.load(Entity)
    # find the "object" field (renamed from `object` to avoid shadowing the builtin)
    [object_field] = [x for x in pfb_schema_json["fields"] if x["name"] == "object"]
    # add the custom schemas (by name) to the object.type union
    object_field["type"].extend(["icdc.case", "icdc.cohort"])
    # dump the JSON to a tempfile to take advantage of fastavro's avsc
    # name resolution in fastavro.schema.load_schema()
    tf = NamedTemporaryFile(mode="w+", dir=".")
    json.dump(pfb_schema_json, tf)
    tf.seek(0)
    # load the customized schema
    pfb_schema = load_schema(tf.name)

# metadata for the PFB message:
# note that unused fields must still be defined, since the schema
# does not allow null (which seems more like an oversight than intention)
icdc_cohort_meta = {
    "name": "icdc.cohort",
    "ontology_reference": "",
    "values": {},
    "links": [],
    "properties": [{
        "name": "cohort_description",
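The tempfile is written to the current directory because, as far as I can tell, fastavro.schema.load_schema resolves an unknown named type by looking for a file named after it (e.g. icdc.case.avsc) in the same directory as the schema being loaded. A minimal sketch, with hypothetical file names:

from fastavro.schema import load_schema

# layout assumed for the sketch:
#   ./customized.Entity.avsc   references "icdc.case" and "icdc.cohort"
#   ./icdc.case.avsc           defines record icdc.case
#   ./icdc.cohort.avsc         defines record icdc.cohort
# load_schema pulls the two sibling files in while resolving the names
schema = load_schema("./customized.Entity.avsc")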
def typed_dict_from_schema_file(schema_path):
    schema = expand_schema(load_schema(schema_path))
    return types_for_schema(schema)
            'triple_norm': [sbj_norm, rel_norm, obj_norm],
            'true_link': {
                'subject': sbj[0]['w_link']['wiki_link'],
                'object': obj[0]['w_link']['wiki_link']
            },
            'src_sentences': [src_sentence]
        }
        dataset.append(triple_record)
    return dataset


def build_datasets(args):
    i, file_name = args
    return read_avro(file_name)


AVRO_SCHEMA_FILE = './avroschema/WikiArticleLinkedNLP.avsc'
AVRO_FOLDER = './data/OPIEC-Linked-triples/'
OUTPUT_FOLDER = './data/opiec-for-canonical/'

avro_folder = Path(AVRO_FOLDER)
AVRO_FILES = avro_folder.glob('*.avro')
schema = load_schema(AVRO_SCHEMA_FILE)

with Pool(10) as p:
    datasets = p.map(build_datasets, enumerate(AVRO_FILES))

# flatten the per-file record lists into a single dataset
dataset = sum(datasets, [])
with open(f'{OUTPUT_FOLDER}/triples.json', 'w') as f:
    json.dump(dataset, f)