def typed_dict_from_schema_file(schema_path, referenced_schema_files=None):
    """Build the types for the Avro schema stored at ``schema_path``.

    Args:
        schema_path: Path of the main schema file.
        referenced_schema_files: Optional collection of schema files to load
            ahead of ``schema_path``. When it is given and non-empty, the
            files are loaded through ``load_schema_ordered`` with
            ``schema_path`` placed last. The caller's collection is not
            modified.

    Returns:
        The result of ``types_for_schema`` for the loaded schema.
    """
    if referenced_schema_files:
        # Build a fresh list instead of appending in place: mutating the
        # caller's list would make repeated calls with the same list keep
        # accumulating schema paths.
        ordered_files = [*referenced_schema_files, schema_path]
        schema = load_schema_ordered(ordered_files)
    else:
        schema = expand_schema(load_schema(schema_path))
    return types_for_schema(schema)
Ejemplo n.º 2
0
def convert_to_avro(data: dict, compress_type: str,
                    avro_schema: str) -> memoryview:
    """Serialize ``data`` into an in-memory Avro container.

    Args:
        data: Payload handed to ``writer`` as the records argument.
            NOTE(review): presumably an iterable of records is expected by
            ``writer``; confirm that callers pass what the ``dict``
            annotation suggests.
        compress_type: Passed to ``writer`` as its ``codec``.
        avro_schema: Passed to ``load_schema`` to obtain the parsed schema.

    Returns:
        A ``memoryview`` over the bytes written to the in-memory buffer.
    """
    buffer = io.BytesIO()
    writer(buffer, load_schema(avro_schema), data, codec=compress_type)
    # Rewind so the stream position is back at the start of the payload.
    buffer.seek(0)
    return buffer.getbuffer()
Ejemplo n.º 3
0
    def custom_schema_test(self):
        """Round-trip a sample company record through ``AvroSchema``.

        The record is encoded and decoded twice: once with a schema
        definition written inline as a dict, and once with the definition
        loaded from ``examples/company.avsc``.
        """

        def nullable(avro_type):
            # Union of 'null' and the given type, as used by every field.
            return ['null', avro_type]

        def encode_and_decode(definition):
            # Encode the sample record, decode it again and compare.
            codec = AvroSchema(None, schema_definition=definition)

            original = {
                "name": "company-name",
                "address": 'xxx road xxx street',
                "employees": [
                    {"name": employee_name, "age": employee_age}
                    for employee_name, employee_age in (
                        ("user1", 25),
                        ("user2", 30),
                        ("user3", 35),
                    )
                ],
                "labels": {
                    "industry": "software",
                    "scale": ">100",
                    "funds": "1000000.0",
                },
                "companyType": "companyType1",
            }
            encoded = codec.encode(original)
            decoded = codec.decode(encoded)
            self.assertEqual(original, decoded)

        employee_record = {
            'type': 'record',
            'name': 'Employee',
            'fields': [
                {'name': 'name', 'type': nullable('string')},
                {'name': 'age', 'type': nullable('int')},
            ],
        }
        company_type_enum = {
            'type': 'enum',
            'name': 'CompanyType',
            'symbols': ['companyType1', 'companyType2', 'companyType3'],
        }
        inline_definition = {
            'doc': 'this is doc',
            'namespace': 'example.avro',
            'type': 'record',
            'name': 'Company',
            'fields': [
                {'name': 'name', 'type': nullable('string')},
                {'name': 'address', 'type': nullable('string')},
                {
                    'name': 'employees',
                    'type': nullable({'type': 'array', 'items': employee_record}),
                },
                {
                    'name': 'labels',
                    'type': nullable({'type': 'map', 'values': 'string'}),
                },
                {'name': 'companyType', 'type': nullable(company_type_enum)},
            ],
        }
        encode_and_decode(inline_definition)

        # A schema can also be loaded from a file with `fastavro.schema`,
        # or via `avro.schema`, e.g.
        # `avro.schema.parse(open("examples/company.avsc", "rb").read()).to_json()`
        encode_and_decode(load_schema("examples/company.avsc"))
Ejemplo n.º 4
0
def read_schemas_and_generate_classes(
    schema_files: Dict[str, List[str]],
    output_type: OutputType,
    *,
    run_black: bool = True,
    namespace_prefix: str = "",
    output_dir: str = ".",
) -> None:
    """Load each group of schema files and generate classes from it.

    Args:
        schema_files: Maps a name (used only in the progress message) to the
            schema files of that group. A group with exactly one file is
            loaded with ``load_schema``; any other group is loaded with
            ``load_schema_ordered``.
        output_type: Kind of output to generate; its ``value`` appears in the
            progress message and it is forwarded to ``generate_classes``.
        run_black: Forwarded to ``generate_classes``.
        namespace_prefix: Forwarded to ``generate_classes``.
        output_dir: Forwarded to ``generate_classes``.
    """
    generation_options = {
        "run_black": run_black,
        "namespace_prefix": namespace_prefix,
        "output_dir": output_dir,
    }
    for group_name, paths in schema_files.items():
        print(f"Parsing schema/s for {group_name}")
        single_file = len(paths) == 1
        schema = load_schema(paths[0]) if single_file else load_schema_ordered(paths)
        print(f"Generate {output_type.value}s...")
        generate_classes(schema, output_type, **generation_options)
Ejemplo n.º 5
0
    def custom_schema_produce_and_consume_test(self):
        """Send and receive ten company records per topic using ``AvroSchema``.

        The scenario runs twice: once with a schema definition written inline
        as a dict, and once with the definition loaded from
        ``examples/company.avsc``. Producer, consumer and client are closed
        even when a send, receive or assertion fails.
        """
        client = pulsar.Client(self.serviceUrl)

        def build_company(i, include_company_type):
            # Single source of truth for the i-th record, so the sent and
            # expected records cannot drift apart.
            company = {
                "name": "company-name" + str(i),
                "address": 'xxx road xxx street ' + str(i),
                "employees": [
                    {"name": "user" + str(i), "age": base_age + i}
                    for base_age in (20, 30, 35)
                ],
                "labels": {
                    "industry": "software" + str(i),
                    "scale": ">100",
                    "funds": "1000000.0",
                },
            }
            if include_company_type:
                company["companyType"] = "companyType" + str((i % 3) + 1)
            return company

        def produce_and_consume(topic, schema_definition):
            print('custom schema produce and consume test topic - ', topic)
            example_avro_schema = AvroSchema(
                None, schema_definition=schema_definition)

            producer = client.create_producer(topic=topic,
                                              schema=example_avro_schema)
            try:
                consumer = client.subscribe(topic,
                                            'test',
                                            schema=example_avro_schema)
                try:
                    for i in range(10):
                        producer.send(
                            build_company(i, include_company_type=True))

                    for i in range(10):
                        msg = consumer.receive()
                        # The expected record has no "companyType", which
                        # matches the inline schema below (it declares no
                        # such field).
                        # NOTE(review): confirm the same expectation holds
                        # for examples/company.avsc.
                        expected = build_company(
                            i, include_company_type=False)
                        self.assertEqual(msg.value(), expected)
                        consumer.acknowledge(msg)
                finally:
                    # Same close order as on the success path: consumer
                    # first, then producer.
                    consumer.close()
            finally:
                producer.close()

        schema_definition = {
            'doc': 'this is doc',
            'namespace': 'example.avro',
            'type': 'record',
            'name': 'Company',
            'fields': [{
                'name': 'name',
                'type': ['null', 'string']
            }, {
                'name': 'address',
                'type': ['null', 'string']
            }, {
                'name': 'employees',
                'type': [
                    'null', {
                        'type': 'array',
                        'items': {
                            'type': 'record',
                            'name': 'Employee',
                            'fields': [{
                                'name': 'name',
                                'type': ['null', 'string']
                            }, {
                                'name': 'age',
                                'type': ['null', 'int']
                            }]
                        }
                    }
                ]
            }, {
                'name': 'labels',
                'type': ['null', {
                    'type': 'map',
                    'values': 'string'
                }]
            }]
        }
        try:
            produce_and_consume('custom-schema-test-1',
                                schema_definition=schema_definition)
            produce_and_consume(
                'custom-schema-test-2',
                schema_definition=load_schema("examples/company.avsc"))
        finally:
            # Release the client even if one of the scenarios failed.
            client.close()
Ejemplo n.º 6
0
pfb_schema = None
with open("pfb.Entity.avsc", "r") as Entity:
    # Load the Entity schema as plain JSON so it can be edited before parsing.
    pfb_schema_json = json.load(Entity)

# Find the single field named "object"; the unpacking raises ValueError if
# there is not exactly one. The name avoids shadowing the builtin `object`.
[object_field] = [
    x for x in pfb_schema_json["fields"] if x["name"] == "object"
]
# Add the custom schemas (by name) to the object field's type array.
object_field["type"].extend(["icdc.case", "icdc.cohort"])

# Dump the edited JSON to a temp file to take advantage of fastavro's avsc
# name resolution in fastavro.schema.load_schema(). The context manager
# closes (and thereby removes) the temp file once the schema is loaded.
# NOTE(review): dir="." presumably keeps the file next to the .avsc files
# that define the referenced names - confirm.
with NamedTemporaryFile(mode="w+", dir=".") as tf:
    json.dump(pfb_schema_json, tf)
    # Make sure the content is on disk before it is re-read by file name.
    tf.flush()
    # Load the customized schema.
    pfb_schema = load_schema(tf.name)

# metadata for PFB message:
# note that unused fields still must be defined, since in the schema
# null is not allowed (which seems more like oversight than intention)

icdc_cohort_meta = {
    "name":
    "icdc.cohort",
    "ontology_reference":
    "",
    "values": {},
    "links": [],
    "properties": [{
        "name": "cohort_description",
def typed_dict_from_schema_file(schema_path):
    """Return ``types_for_schema`` applied to the expanded schema file.

    The file at ``schema_path`` is read with ``load_schema`` and the result
    is passed through ``expand_schema`` before the types are built.
    """
    loaded = load_schema(schema_path)
    expanded = expand_schema(loaded)
    return types_for_schema(expanded)
Ejemplo n.º 8
0
                'triple_norm': [sbj_norm, rel_norm, obj_norm],
                'true_link': {
                    'subject': sbj[0]['w_link']['wiki_link'],
                    'object': obj[0]['w_link']['wiki_link']
                },
                'src_sentences': [src_sentence]
            }
            dataset.append(triple_record)
    return dataset


def build_datasets(args):
    """Read one Avro file described by an ``(index, file_name)`` pair.

    Takes a single two-item argument so it can be used with ``Pool.map``
    over ``enumerate(...)``; the index is not used.
    """
    _index, avro_path = args
    return read_avro(avro_path)


AVRO_SCHEMA_FILE = './avroschema/WikiArticleLinkedNLP.avsc'
AVRO_FOLDER = './data/OPIEC-Linked-triples/'
OUTPUT_FOLDER = './data/opiec-for-canonical/'

avro_folder = Path(AVRO_FOLDER)
AVRO_FILES = avro_folder.glob('*.avro')

# Kept at module level so it is also defined in worker processes that
# re-import this module.
schema = load_schema(AVRO_SCHEMA_FILE)

# Guard the driver: under the "spawn" start method every worker re-imports
# this module, and an unguarded Pool would be created again in each worker.
if __name__ == '__main__':
    with Pool(10) as p:
        datasets = p.map(build_datasets, enumerate(AVRO_FILES))

    # Flatten the per-file lists in one pass; sum(datasets, []) copies the
    # accumulated list at every step.
    dataset = [record for part in datasets for record in part]

    # The pool is no longer needed here, so the file is written after it
    # has been released.
    with open(Path(OUTPUT_FOLDER) / 'triples.json', 'w') as f:
        json.dump(dataset, f)