def test_fixed_named_type():
    """https://github.com/fastavro/fastavro/issues/450"""
    schema = {
        "type": "record",
        "name": "test_fixed_named_type",
        "fields": [
            {
                "name": "test1",
                "type": {
                    "type": "fixed",
                    "name": "my_fixed",
                    "size": 4,
                },
            },
            {
                "name": "test2",
                "type": "my_fixed",
            },
        ],
    }

    record = {"test1": b"1234", "test2": b"4321"}

    parsed_schema = parse_schema(schema)
    validate(record, parsed_schema)
def test_no_data(tmpdir):
    filepath = tmpdir + "no_data.avro"
    schema = {
        "name": "Weather",
        "type": "record",
        "fields": [
            {"name": "station", "type": "string"},
            {"name": "time", "type": "long"},
            {"name": "temp", "type": "int"},
        ],
    }

    parsed_schema = fa.parse_schema(schema)
    with open(filepath, "wb") as out:
        fa.writer(out, parsed_schema, [])

    df = cudf.read_avro(filepath)

    # fastavro returns an empty dataframe, need to verify manually
    assert_eq(df.shape, (0, 3))
    dtypes = df.dtypes.values.tolist()
    assert_eq(dtypes, [np.dtype("O"), np.dtype("int64"), np.dtype("int32")])
    col_names = df.columns.tolist()
    assert_eq(col_names, ["station", "time", "temp"])
class ClickEvent:
    email: str = field(default_factory=faker.email)
    timestamp: str = field(default_factory=faker.iso8601)
    uri: str = field(default_factory=faker.uri)
    number: int = field(default_factory=lambda: random.randint(0, 999))
    attributes: dict = field(default_factory=ClickAttribute.attributes)

    schema = parse_schema({
        "type": "record",
        "name": "click_event",
        "namespace": "com.udacity.lesson3.exercise2",
        "fields": [
            {"name": "email", "type": "string"},
            {"name": "timestamp", "type": "string"},
            {"name": "uri", "type": "string"},
            {"name": "number", "type": "int"},
            #
            # TODO: Add the attributes map!
            {
                "name": "attributes",
                "type": {
                    "type": "map",
                    "values": {
                        "type": "record",
                        "name": "attribute",
                        "fields": [
                            {"name": "element", "type": "string"},
                            {"name": "content", "type": "string"},
                        ],
                    },
                },
            },
        ],
    })

    def serialize(self):
        """Serializes the ClickEvent for sending to Kafka"""
        out = BytesIO()
        writer(out, ClickEvent.schema, [asdict(self)])
        return out.getvalue()
def test_enum_named_type():
    """https://github.com/fastavro/fastavro/issues/450"""
    schema = {
        "type": "record",
        "name": "test_enum_named_type",
        "fields": [
            {
                "name": "test1",
                "type": {
                    "type": "enum",
                    "name": "my_enum",
                    "symbols": ["FOO", "BAR"],
                },
            },
            {
                "name": "test2",
                "type": "my_enum",
            },
        ],
    }

    record = {"test1": "FOO", "test2": "BAR"}

    parsed_schema = parse_schema(schema)
    validate(record, parsed_schema)
def write_avro():
    schema = {
        "namespace": "sample.avro",
        "type": "record",
        "name": "Cars",
        "fields": [
            {"name": "model", "type": "string"},
            {"name": "make", "type": ["string", "null"]},
            {"name": "year", "type": ["int", "null"]}
        ]
    }

    records = [
        {"model": "ABC", "make": "Audi", "year": 2010},
        {"model": "ABC", "make": "Audi", "year": 2010},
        {"model": "ABC", "make": "Audi", "year": 2010},
        {"model": "ABC", "make": "Audi", "year": 2010},
        {"model": "ABC", "make": "Audi", "year": 2010},
        {"model": "EDF", "make": "BEWM"},
        {"model": "EDF", "make": "BEWM"},
        {"model": "EDF", "make": "BEWM"},
        {"model": "EDF", "make": "BEWM"},
        {"model": "EDF", "make": "BEWM"},
        {"model": "XCV", "year": 2010},
        {"model": "XCV", "year": 2010},
        {"model": "XCV", "year": 2010},
        {"model": "XCV", "year": 2010},
        {"model": "XCV", "year": 2010}
    ]

    os.makedirs(os.path.join('.', 'data'), exist_ok=True)
    with open(os.path.join('.', 'data', 'cars.avro'), 'wb') as avro_file:
        writer(avro_file, parse_schema(schema), records)
    with open(os.path.join('.', 'data', 'cars.json'), 'w') as json_file:
        json.dump(records, json_file)
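# A minimal sketch (not from the original source) of reading cars.avro back with
# fastavro.reader; `read_avro_back` is a hypothetical helper name used only for
# illustration. Records written above without "make" or "year" typically come back
# with those fields set to None, since the nullable unions let the writer store
# missing values as null.
from fastavro import reader

def read_avro_back():
    with open(os.path.join('.', 'data', 'cars.avro'), 'rb') as avro_file:
        return list(reader(avro_file))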
def encode_into_avro(alert: dict, schema_file: str) -> str:
    """Encode a dict record into avro bytes

    Parameters
    ----------
    alert: dict
        A Dictionary of alert data
    schema_file: str
        Path of avro schema file

    Returns
    ----------
    value: str
        a bytes string with avro encoded alert data

    Examples
    ----------
    >>> r = AlertReader(avro_single_alert)
    >>> alert = r.to_list(size=1)[0]
    >>> avro_encoded = encode_into_avro(alert, schema_path)
    """
    with open(schema_file) as f:
        schema = json.load(f)

    parsed_schema = fastavro.parse_schema(schema)
    b = io.BytesIO()
    fastavro.schemaless_writer(b, parsed_schema, alert)

    return b.getvalue()
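# A minimal sketch (not from the original source) of the complementary decode step.
# schemaless_writer output has no embedded header, so the same schema must be parsed
# and supplied again on read; `decode_from_avro` and `avro_bytes` are hypothetical
# names used only for illustration.
def decode_from_avro(avro_bytes: bytes, schema_file: str) -> dict:
    with open(schema_file) as f:
        parsed_schema = fastavro.parse_schema(json.load(f))
    return fastavro.schemaless_reader(io.BytesIO(avro_bytes), parsed_schema)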
def load_schema(name, suffix=""):
    """Load an Avro schema from the local app data.

    This function is memoized so that repeated calls are fast.

    Parameters
    ----------
    name : `str`
        Name of the schema. This should be a fully-qualified name that matches
        the file name of schemas in the ``schemas/`` directory of the source
        repository.
    suffix : `str`, optional
        A suffix to add to the schema's name. This is typically used to create
        "staging" schemas, therefore "staging subjects" in the Schema Registry.

    Returns
    -------
    schema : `dict`
        A schema object.
    """
    schemas_dir = Path(__file__).parent / "schemas"
    schema_path = schemas_dir / f"{name}.json"
    schema = json.loads(schema_path.read_text())

    if suffix:
        schema["name"] = "".join((schema["name"], suffix))

    return fastavro.parse_schema(schema)
class ClickEvent:
    email: str = field(default_factory=faker.email)
    timestamp: str = field(default_factory=faker.iso8601)
    uri: str = field(default_factory=faker.uri)
    number: int = field(default_factory=lambda: random.randint(0, 999))

    #
    # TODO: Define an Avro Schema for this ClickEvent
    # See: https://avro.apache.org/docs/1.8.2/spec.html#schema_record
    # See: https://fastavro.readthedocs.io/en/latest/schema.html?highlight=parse_schema#fastavro-schema
    #
    schema = parse_schema({
        "type"
    })

    def serialize(self):
        """Serializes the ClickEvent for sending to Kafka"""
        #
        # TODO: Rewrite the serializer to send data in Avro format
        # See: https://fastavro.readthedocs.io/en/latest/schema.html?highlight=parse_schema#fastavro-schema
        #
        # HINT: Python dataclasses provide an `asdict` method that can quickly transform this
        # instance into a dictionary!
        # See: https://docs.python.org/3/library/dataclasses.html#dataclasses.asdict
        #
        # HINT: Use BytesIO for your output buffer. Once you have an output buffer instance, call
        # `getvalue()` to retrieve the data inside the buffer.
        # See: https://docs.python.org/3/library/io.html?highlight=bytesio#io.BytesIO
        #
        return json.dumps(
            {"uri": self.uri, "timestamp": self.timestamp, "email": self.email}
        )
def __write(self, data):
    """Write data into the avro file."""
    if not self.__schema:
        self.__schema = parse_schema(
            self.__avro_translator.deduce_scheme(data[0]))
    fast_writer(self.__descriptor, self.__schema, data, self.__codec)
def cudf_from_avro_util(schema, records):
    schema = [] if schema is None else fastavro.parse_schema(schema)
    buffer = io.BytesIO()
    fastavro.writer(buffer, schema, records)
    buffer.seek(0)
    return cudf.read_avro(buffer)
def __init__(self, file_, data=None, schema=None, codec: str = 'snappy'):
    """Create an avro archive.

    Note:
        Specification: https://avro.apache.org/docs/1.8.2/spec.html

    Args:
        file_ (str, BytesIO, IOBase): the output file_
        data (dict, list(dict)): the data to write
        schema (dict): the avro schema as dictionary

    Returns:
        AvroDataFileWriter: the instance of this object
    """
    self.__descriptor = None
    if isinstance(file_, str):
        self.__descriptor = open(file_, 'wb')
    elif isinstance(file_, (BytesIO, IOBase)):
        self.__descriptor = file_
    else:
        raise Exception("Type '{}' for file_ is not supported...".format(
            type(file_)))

    self.__schema = None
    self.__codec = codec
    self.__avro_translator = AvroObjectTranslator()

    if schema:
        self.__schema = parse_schema(schema)

    if data is not None:
        self.append(data)
def _write_inference_result(self, sample_ids, labels, weights, scores,
                            scores_and_offsets, task_index, schema_params, output_dir):
    """ Write inference results. """
    photon_ml_writer = PhotonMLWriter(schema_params=schema_params)
    output_avro_schema = photon_ml_writer.get_inference_output_avro_schema(
        self.metadata,
        self._has_label(schema_params[constants.LABEL]),
        True,
        has_weight=self._has_feature(schema_params[constants.SAMPLE_WEIGHT]))
    parsed_schema = parse_schema(output_avro_schema)

    records = []
    for rec_id, rec_label, rec_weight, rec_score, rec_score_and_offset in \
            zip(sample_ids, labels, weights, scores, scores_and_offsets):
        rec = {schema_params[constants.SAMPLE_ID]: int(rec_id),
               schema_params[constants.PREDICTION_SCORE]: float(rec_score),
               schema_params[constants.PREDICTION_SCORE_PER_COORDINATE]: float(rec_score_and_offset)}
        if self._has_label(schema_params[constants.LABEL]):
            rec[schema_params[constants.LABEL]] = int(rec_label)
        if self._has_feature(schema_params[constants.SAMPLE_WEIGHT]):
            rec[schema_params[constants.SAMPLE_WEIGHT]] = int(rec_weight)
        records.append(rec)

    output_file = os.path.join(output_dir, "part-{0:05d}.avro".format(task_index))
    error_msg = "worker {} encountered error in writing inference results".format(task_index)
    with tf1.gfile.GFile(output_file, 'wb') as f:
        try_write_avro_blocks(f, parsed_schema, records, None, error_msg)
    logging("Worker {} saved inference result to {}".format(task_index, output_file))
def _write_toavro(table, target, mode, schema, sample,
                  codec='deflate', compression_level=None, **avro_args):
    if table is None or len(table) <= 0:
        return

    # build a schema when not defined by user
    if not schema:
        schema, table2 = _build_schema_from_values(table, sample)
    else:
        table2 = _fix_missing_headers(table, schema)

    # fastavro expects an iterator of dicts
    rows = dicts(table2) if PY3 else _ordered_dict_iterator(table2)

    with target.open(mode) as target_file:
        # delay the import of fastavro for not breaking when unused
        import fastavro
        parsed_schema = fastavro.parse_schema(schema)
        # this could raise an error when any value is not of a supported type
        fastavro.writer(fo=target_file, schema=parsed_schema, records=rows,
                        codec=codec, codec_compression_level=compression_level,
                        **avro_args)
class ClickEvent:
    email: str = field(default_factory=faker.email)
    timestamp: str = field(default_factory=faker.iso8601)
    uri: str = field(default_factory=faker.uri)
    number: int = field(default_factory=lambda: random.randint(0, 999))
    attributes: dict = field(default_factory=ClickAttribute.attributes)

    #
    # TODO: Update this Avro schema to include a map of attributes
    # See: https://avro.apache.org/docs/1.8.2/spec.html#Maps
    #
    schema = parse_schema(
        {
            "type": "record",
            "name": "click_event",
            "namespace": "com.udacity.lesson3.exercise2",
            "fields": [
                {"name": "email", "type": "string"},
                {"name": "timestamp", "type": "string"},
                {"name": "uri", "type": "string"},
                {"name": "number", "type": "int"}
                #
                # TODO: Add the attributes map!
                #
            ],
        }
    )

    def serialize(self):
        """Serializes the ClickEvent for sending to Kafka"""
        out = BytesIO()
        writer(out, ClickEvent.schema, [asdict(self)])
        return out.getvalue()
class Purchase:
    username: str = field(default_factory=faker.user_name)
    currency: str = field(default_factory=faker.currency_code)
    amount: int = field(default_factory=lambda: random.randint(100, 200000))

    schema = parse_schema({
        "type": "record",
        "name": "purchase",
        "namespace": "com.udacity.lesson3.sample2",
        "fields": [
            {"name": "username", "type": "string"},
            {"name": "currency", "type": "string"},
            {"name": "amount", "type": "int"},
        ],
    })

    def serialize(self):
        #
        # TODO: Modify the following sample to use Avro instead of JSON
        #
        out = io.BytesIO()
        writer(out, Purchase.schema, [asdict(self)])
        return out.getvalue()
def read_schema() -> fastavro.schema:
    with open("avro/schema/schema-SKDB.public.sdcocdmst.json", "r") as file:
        schema = json.load(file)
        schema = json.loads(schema["schema"])
        schema = fastavro.parse_schema(schema)
    return schema
def fix_record_for_avro(record, avro_schema):
    for field in avro_schema['fields']:
        field_name = field['name']
        datatype = field['type']
        if isinstance(datatype, dict):
            # This is a record type definition so we need to recurse a level deeper.
            record[field_name] = fix_record_for_avro(
                record[field_name], fastavro.parse_schema(datatype))[0]
        elif isinstance(datatype, list) and isinstance(datatype[1], dict):
            logical_type = datatype[1].get('logicalType', None)
            if logical_type:
                if logical_type.find('-') > -1:
                    logical_prefix, precision = logical_type.split('-')
                else:
                    logical_prefix = logical_type
                    precision = None
                if logical_prefix == 'timestamp':
                    is_micros = (precision == 'micros')
                    record[field_name] = datetime_to_epoch_timestamp(
                        record[field_name], micros=is_micros)
                elif logical_type == 'date':
                    record[field_name] = date_to_epoch_date(record[field_name])
                elif logical_prefix == 'time':
                    is_micros = (precision == 'micros')
                    record[field_name] = time_to_epoch_time(record[field_name],
                                                            micros=is_micros)
    return [record]
def gen_schema():
    schema = {
        'doc': 'Twitter',
        'name': 'Twitter',
        'namespace': 'twitter',
        'type': 'record',
        'fields': [
            {'name': 'twitter_name', 'type': 'string'},
            {'name': 'twitter_token', 'type': 'float'},
            {'name': 'uuid', 'type': 'string'},
        ],
    }
    parsed_schema = parse_schema(schema)
    return parsed_schema
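# A minimal sketch (not from the original source) of serializing one record with the
# parsed schema returned by gen_schema(); the file name "twitter.avro" and the sample
# values are illustrative assumptions only.
from fastavro import writer

with open("twitter.avro", "wb") as out:
    writer(out, gen_schema(), [
        {"twitter_name": "alice", "twitter_token": 0.42, "uuid": "1234-abcd"},
    ])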
def test_serde_to_avro(pytestconfig: PytestConfig, json_filename: str) -> None:
    # In this test, we want to read in from JSON -> MCE object.
    # Next we serialize from MCE to Avro and then deserialize back to MCE.
    # Finally, we want to compare the two MCE objects.
    json_path = pytestconfig.rootpath / json_filename
    mces = list(iterate_mce_file(str(json_path)))

    # Serialize to Avro.
    parsed_schema = fastavro.parse_schema(
        json.loads(getMetadataChangeEventSchema()))
    fo = io.BytesIO()
    out_records = [mce.to_obj(tuples=True) for mce in mces]
    fastavro.writer(fo, parsed_schema, out_records)

    # Deserialize from Avro.
    fo.seek(0)
    in_records = list(fastavro.reader(fo, return_record_name=True))
    in_mces = [
        MetadataChangeEventClass.from_obj(record, tuples=True)
        for record in in_records
    ]

    # Check diff
    assert len(mces) == len(in_mces)
    for i in range(len(mces)):
        assert mces[i] == in_mces[i]
class ClickEvent:
    email: str = field(default_factory=faker.email)
    timestamp: str = field(default_factory=faker.iso8601)
    uri: str = field(default_factory=faker.uri)
    number: int = field(default_factory=lambda: random.randint(0, 999))

    schema = parse_schema({
        "type": "record",
        "name": "click_event",
        "namespace": "com.hcvn.bi",
        "fields": [
            {"name": "email", "type": "string"},
            {"name": "timestamp", "type": "string"},
            {"name": "uri", "type": "string"},
            {"name": "number", "type": "int"}
        ]
    })

    def serialize(self):
        out = BytesIO()
        writer(out, ClickEvent.schema, [asdict(self)])
        return out.getvalue()
def test_parse_schema_accepts_nested_records_from_arrays():
    parsed_schema = fastavro.parse_schema({
        "fields": [
            {
                "type": {
                    "items": {
                        "type": "record",
                        "fields": [{"type": "string", "name": "text"}],
                        "name": "Nested"
                    },
                    "type": "array",
                },
                "name": "multiple"
            },
            {
                "type": {"type": "array", "items": "Nested"},
                "name": "single"
            },
        ],
        "type": "record",
        "name": "test_parse_schema_accepts_nested_records_from_arrays",
    })
    assert "Nested" == parsed_schema["fields"][1]["type"]["items"]
def _write_inference_result(self, sample_ids, labels, weights, prediction_score,
                            prediction_score_per_coordinate, task_index,
                            schema_params: SchemaParams, output_dir):
    """ Write inference results. """
    output_avro_schema = get_inference_output_avro_schema(
        self.metadata,
        True,
        schema_params,
        has_weight=self._has_feature(schema_params.weight_column_name))
    parsed_schema = parse_schema(output_avro_schema)

    records = []
    for rec_id, rec_label, rec_weight, rec_prediction_score, rec_prediction_score_per_coordinate in \
            zip(sample_ids, labels, weights, prediction_score, prediction_score_per_coordinate):
        rec = {schema_params.uid_column_name: int(rec_id),
               schema_params.prediction_score_column_name: float(rec_prediction_score),
               schema_params.prediction_score_per_coordinate_column_name: float(rec_prediction_score_per_coordinate)}
        if self._has_label(schema_params.label_column_name):
            rec[schema_params.label_column_name] = int(rec_label)
        if self._has_feature(schema_params.weight_column_name):
            rec[schema_params.weight_column_name] = int(rec_weight)
        records.append(rec)

    output_file = os.path.join(output_dir, f"part-{task_index:05d}.avro")
    error_msg = f"worker {task_index} encountered error in writing inference results"
    with tf1.gfile.GFile(output_file, 'wb') as f:
        try_write_avro_blocks(f, parsed_schema, records, None, error_msg)
    logging(f"Worker {task_index} saved inference result to {output_file}")
def test_parse_schema_resolves_references_from_unions():
    parsed_schema = fastavro.parse_schema({
        "namespace": "com.other",
        "name": "Outer",
        "type": "record",
        "fields": [
            {
                "name": "a",
                "type": [
                    "null",
                    {
                        "type": "record",
                        "name": "Inner",
                        "fields": [{"name": "the_thing", "type": "string"}]
                    }
                ]
            },
            {
                "name": "b",
                # This should resolve to com.other.Inner because of the
                # `namespace` of the enclosing record.
                "type": ["null", "Inner"]
            }
        ]
    })
    assert "com.other.Inner" == parsed_schema["fields"][1]["type"][1]
def test_order_of_values_in_map():
    """https://github.com/fastavro/fastavro/issues/303"""
    schema = {
        'doc': 'A weather reading.',
        'name': 'Weather',
        'namespace': 'test',
        'type': 'record',
        'fields': [{
            'name': 'metadata',
            'type': {
                'type': 'map',
                'values': [
                    {'type': 'array', 'items': 'string'},
                    {'type': 'map', 'values': ['string']}
                ]
            }
        }],
    }
    parsed_schema = fastavro.parse_schema(schema)

    records = [{'metadata': {'map1': {'map2': 'str'}}}]
    assert records == roundtrip(parsed_schema, records)
def __init__(
    self,
    file_path_prefix=None,
    location=None,
    schema=parse_schema(KLIO_SCHEMA_OBJ),
    codec="deflate",
    file_name_suffix="",
    num_shards=0,
    shard_name_template=None,
    mime_type="application/x-avro",
):
    file_path = self._get_file_path(file_path_prefix, location)
    super(KlioWriteToAvro, self).__init__(
        file_path_prefix=file_path,
        schema=schema,
        codec=codec,
        file_name_suffix=file_name_suffix,
        num_shards=num_shards,
        shard_name_template=shard_name_template,
        mime_type=mime_type,
        use_fastavro=True,
    )
    self._sink = _KlioFastAvroSink(
        file_path,
        schema,
        codec,
        file_name_suffix,
        num_shards,
        shard_name_template,
        mime_type,
    )
def test_parse_schema_accepts_nested_namespaces():
    parsed_schema = fastavro.parse_schema({
        "namespace": "com.example",
        "name": "Outer",
        "type": "record",
        "fields": [
            {
                "name": "a",
                "type": {
                    "type": "record",
                    "name": "Inner",
                    "fields": [{"name": "the_thing", "type": "string"}]
                }
            },
            {
                "name": "b",
                # This should resolve to com.example.Inner because of the
                # `namespace` of the enclosing record.
                "type": "Inner"
            },
            {
                "name": "b",
                "type": "com.example.Inner"
            }
        ]
    })
    assert "com.example.Inner" == parsed_schema["fields"][0]["type"]["name"]
    assert "com.example.Inner" == parsed_schema["fields"][1]["type"]
def test_newer_versions_of_named_schemas_2():
    """https://github.com/fastavro/fastavro/issues/450"""
    schema = {
        "name": "Weather",
        "type": "record",
        "fields": [
            {
                "name": "place1",
                "type": {
                    "name": "Location",
                    "type": "record",
                    "fields": [{"name": "city", "type": "string"}],
                },
            },
            {
                "name": "place2",
                "type": "Location",
            },
        ],
    }

    example_1 = {"place1": {"city": "London"}, "place2": {"city": "Berlin"}}

    parsed_schema = fastavro.parse_schema(schema)
    assert example_1 == roundtrip(parsed_schema, example_1)
def main(args):
    schema_row_size = struct.calcsize(args.format)
    parsed_schema = fastavro.parse_schema(schema(args.format))
    file_path = Path(args.bin_input)
    file_size = file_path.stat().st_size
    out_filename = (args.output if args.output
                    else filename_from_args(args, prefix='sima.exported.',
                                            postfix='.avro'))

    if file_size % schema_row_size != 0:
        msg = 'Format {} does not fit into file {}'.format(
            args.format, args.bin_input)
        raise ExportError(msg)

    with open(str(file_path), 'rb') as binf, open(out_filename, 'wb') as avrf:
        records = ({
            "col_{}".format(i): v
            for i, v in enumerate(struct.unpack(args.format, piece))
        } for piece in read_in_chunks(binf, schema_row_size))
        fastavro.writer(avrf, parsed_schema, records,
                        codec='deflate', codec_compression_level=4)
def test_record_named_type():
    """https://github.com/fastavro/fastavro/issues/450"""
    schema = {
        "type": "record",
        "name": "test_record_named_type",
        "fields": [
            {
                "name": "test1",
                "type": {
                    "type": "record",
                    "name": "my_record",
                    "fields": [{
                        "name": "field1",
                        "type": "string",
                    }]
                },
            },
            {
                "name": "test2",
                "type": "my_record",
            },
        ]
    }

    record = {"test1": {"field1": "foo"}, "test2": {"field1": "bar"}}

    parsed_schema = parse_schema(schema)
    validate(record, parsed_schema)
def write_garmin_file_object_to_file(gfile, avro_file):
    parsed_schema = fastavro.parse_schema(GarminFile._avro_schema)
    js = [gfile.to_dict()]
    with gzip.open('%s.tmp' % avro_file, 'wb') as f:
        fastavro.writer(f, parsed_schema, js, validator=True)
    os.rename('%s.tmp' % avro_file, avro_file)
    return True
def validater(schema, records, runs=1):
    times = []
    valid = []
    schema = parse_schema(schema)
    for _ in range(runs):
        start = time.time()
        valid = validate_many(records, schema)
        end = time.time()
        times.append(end - start)
    print('... {0} runs averaged {1} seconds'.format(runs, (sum(times) / runs)))
    return valid
def write(schema, records, runs=1):
    times = []
    schema = parse_schema(schema)
    for _ in range(runs):
        iostream = BytesIO()
        start = time.time()
        writer(iostream, schema, records)
        end = time.time()
        times.append(end - start)
    print('... {0} runs averaged {1} seconds'.format(runs, (sum(times) / runs)))
    return iostream
def read_schemaless(iostream, schema, num_records, runs=1):
    times = []
    schema = parse_schema(schema)
    for _ in range(runs):
        for _ in range(num_records):
            iostream.seek(0)
            start = time.time()
            record = schemaless_reader(iostream, schema)
            end = time.time()
            times.append(end - start)
    print('... {0} runs averaged {1} seconds'.format(runs, (sum(times) / runs)))
    return record
def _bq_to_avro_blocks(bq_blocks, avro_schema_json):
    avro_schema = fastavro.parse_schema(avro_schema_json)
    avro_blocks = []
    for block in bq_blocks:
        blockio = six.BytesIO()
        for row in block:
            fastavro.schemaless_writer(blockio, avro_schema, row)

        response = bigquery_storage_v1beta1.types.ReadRowsResponse()
        response.avro_rows.row_count = len(block)
        response.avro_rows.serialized_binary_rows = blockio.getvalue()
        avro_blocks.append(response)
    return avro_blocks
def _avro_schema(read_session):
    """Extract and parse Avro schema from a read session.

    Args:
        read_session ( \
            ~google.cloud.bigquery_storage_v1beta1.types.ReadSession \
        ):
            The read session associated with this read rows stream. This
            contains the schema, which is required to parse the data blocks.

    Returns:
        Tuple[fastavro.schema, Tuple[str]]:
            A parsed Avro schema, using :func:`fastavro.schema.parse_schema`
            and the column names for a read session.
    """
    json_schema = json.loads(read_session.avro_schema.schema)
    column_names = tuple((field["name"] for field in json_schema["fields"]))
    return fastavro.parse_schema(json_schema), column_names
SCHEMA = {
    "namespace": "big_obj",
    "type": "record",
    "name": "DirFolders",
    "fields": [
        {"name": "base_folder", "type": {
            "type": "record",
            "name": "Folder",
            "fields": [
                {"name": "path", "type": "string"},
                {"name": "last_mod", "type": ["null", "string"], "default": "null"},
                {"name": "files", "type": ["null", {"type": "array", "items": {
                    "type": "record",
                    "name": "File",
                    "fields": [
                        {"name": "file_name", "type": "string"},
                        {"name": "content", "type": "bytes"},
                        {"name": "last_mod", "type": ["null", "string"], "default": "null"}
                    ]
                }}], "default": "null"},
                {"name": "folders", "type": {"type": "array", "items": "Folder"}}
            ]}},
        {"name": "details", "type": "string"},
    ]
}

avro_schema = fastavro.parse_schema(SCHEMA)
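# A minimal sketch (not from the original source) of a record matching the recursive
# Folder schema above, written and read back with fastavro; the sample values are
# illustrative assumptions only.
import io

sample = {
    "base_folder": {
        "path": "/tmp",
        "last_mod": "2021-01-01",
        "files": [{"file_name": "a.txt", "content": b"hello", "last_mod": None}],
        # a nested Folder record exercising the recursive "folders" reference
        "folders": [{"path": "/tmp/sub", "last_mod": None, "files": None, "folders": []}],
    },
    "details": "example",
}

buf = io.BytesIO()
fastavro.writer(buf, avro_schema, [sample])
buf.seek(0)
roundtripped = list(fastavro.reader(buf))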