def test_schema_migration_schema_mismatch():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": "string",
        }]
    }
    new_schema = {
        "type": "enum",
        "name": "test",
        "symbols": ["FOO", "BAR"],
    }
    new_file = MemoryIO()
    records = [{"test": "test"}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
def write(records):
    fastavro.writer(
        fo=self._fo.__enter__(),
        schema=self._schema,
        records=records,
        **self._writer_kwargs
    )
def roundtrip(record, writer_schema, reader_schema):
    new_file = MemoryIO()
    fastavro.writer(new_file, writer_schema, [record])
    new_file.seek(0)
    new_records = list(fastavro.reader(new_file, reader_schema))
    return new_records[0]
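# A minimal usage sketch for a roundtrip helper like the one above, checking
# int-to-long promotion; MemoryIO is assumed to be an alias for io.BytesIO,
# and the schema and record values here are illustrative, not from the source.
from io import BytesIO as MemoryIO

import fastavro


def roundtrip_promotion_example():
    writer_schema = {
        "type": "record",
        "name": "Promote",
        "fields": [{"name": "test", "type": "int"}],
    }
    reader_schema = {
        "type": "record",
        "name": "Promote",
        "fields": [{"name": "test", "type": "long"}],
    }
    new_file = MemoryIO()
    fastavro.writer(new_file, writer_schema, [{"test": 1}])
    new_file.seek(0)
    assert list(fastavro.reader(new_file, reader_schema)) == [{"test": 1}]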
def check(filename):
    with open(filename, 'rb') as fo:
        reader = fastavro.reader(fo)
        assert hasattr(reader, 'schema'), 'no schema on file'
        if basename(filename) in NO_DATA:
            return
        records = list(reader)
        assert len(records) > 0, 'no records found'

    new_file = MemoryIO()
    fastavro.writer(new_file, reader.schema, records, reader.codec)
    new_file_bytes = new_file.getvalue()

    new_file = NoSeekMemoryIO(new_file_bytes)
    new_reader = fastavro.reader(new_file)
    assert hasattr(new_reader, 'schema'), "schema wasn't written"
    assert new_reader.schema == reader.schema
    assert new_reader.codec == reader.codec
    new_records = list(new_reader)
    assert new_records == records

    # Test schema migration with the same schema
    new_file = NoSeekMemoryIO(new_file_bytes)
    schema_migration_reader = fastavro.reader(new_file, reader.schema)
    assert schema_migration_reader.reader_schema == reader.schema
    new_records = list(schema_migration_reader)
    assert new_records == records
def serialize_avro_to_string(schema, content):
    bytes_writer = BytesIO()
    fastavro.writer(bytes_writer, schema, content)
    # encoder = avro.io.BinaryEncoder(bytes_writer)
    # datum_writer.write(content, encoder)
    return bytes_writer.getvalue()
def test_schema_migration_maps_with_union_promotion():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "map", "values": ["string", "int"]},
        }]
    }
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "map", "values": ["string", "long"]},
        }]
    }
    new_file = MemoryIO()
    records = [{"test": {"foo": 1}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
def test_schema_migration_array_with_union_promotion():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "array", "items": ["boolean", "long"]},
        }]
    }
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "array", "items": ["string", "float"]},
        }]
    }
    new_file = MemoryIO()
    records = [{"test": [1, 2, 3]}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
def test_schema_migration_array_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "array", "items": ["string", "int"]},
        }]
    }
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "array", "items": ["string", "boolean"]},
        }]
    }
    new_file = MemoryIO()
    records = [{"test": [1, 2, 3]}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
def test_str_py3():
    letters = ascii_uppercase + digits
    id_size = 100
    seed('str_py3')  # Repeatable results

    def gen_id():
        return ''.join(choice(letters) for _ in range(id_size))

    keys = ['first', 'second', 'third', 'fourth']
    testdata = [{key: gen_id() for key in keys} for _ in range(50)]
    schema = {
        "fields": [{'name': key, 'type': 'string'} for key in keys],
        "namespace": "namespace",
        "name": "zerobyte",
        "type": "record",
    }

    buf = BytesIO()
    fastavro.writer(buf, schema, testdata)

    buf.seek(0, SEEK_SET)
    for i, rec in enumerate(fastavro.reader(buf), 1):
        pass

    size = len(testdata)
    assert i == size, 'bad number of records'
    assert rec == testdata[-1], 'bad last record'
def test_schema_migration_maps_failure():
    schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "map", "values": "string"},
        }]
    }
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": {"type": "map", "values": "long"},
        }]
    }
    new_file = MemoryIO()
    records = [{"test": {"foo": "a"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    try:
        list(new_reader)
    except fastavro._reader.SchemaResolutionError:
        pass
    else:
        assert False
def send_file_avro():
    schema, records = get_data()
    buf = io.BytesIO()
    fastavro.writer(buf, schema, records)
    buf.seek(0)
    return send_file(buf,
                     attachment_filename='ccc.avro',
                     mimetype='application/octet-stream')
def roundtrip(schema, records, new_schema):
    new_file = MemoryIO()
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    reader = fastavro.reader(new_file, new_schema)
    new_records = list(reader)
    return new_records
def test_repo_caching_issue():
    schema = {
        "type": "record",
        "name": "B",
        "fields": [{
            "name": "b",
            "type": {
                "type": "record",
                "name": "C",
                "fields": [{"name": "c", "type": "string"}]
            }
        }]
    }
    new_file = MemoryIO()
    records = [{"b": {"c": "test"}}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == records

    other_schema = {
        "name": "A",
        "type": "record",
        "fields": [{
            "name": "a",
            "type": {
                "type": "record",
                "name": "B",
                "fields": [{
                    "name": "b",
                    "type": {
                        "type": "record",
                        "name": "C",
                        "fields": [{"name": "c", "type": "int"}]
                    }
                }]
            }
        }, {
            "name": "aa",
            "type": "B"
        }]
    }
    new_file = MemoryIO()
    records = [{"a": {"b": {"c": 1}}, "aa": {"b": {"c": 2}}}]
    fastavro.writer(new_file, other_schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == records
def test_default_values():
    schema = {
        "type": "record",
        "fields": [{
            "name": "default_field",
            "type": "string",
            "default": "default_value"
        }]
    }
    new_file = MemoryIO()
    records = [{}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == [{"default_field": "default_value"}]
def test_metadata():
    schema = {"type": "record", "fields": []}
    new_file = MemoryIO()
    records = [{}]
    metadata = {"key": "value"}
    fastavro.writer(new_file, schema, records, metadata=metadata)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    assert new_reader.metadata["key"] == metadata["key"]
def test_no_default():
    io = MemoryIO()
    schema = {
        'type': 'record',
        'name': 'test',
        'fields': [
            {'type': 'boolean', 'name': 'a'}
        ],
    }
    fastavro.writer(io, schema, [{}])
def write(schema, records, runs=1):
    times = []
    schema = parse_schema(schema)
    for _ in range(runs):
        iostream = BytesIO()
        start = time.time()
        writer(iostream, schema, records)
        end = time.time()
        times.append(end - start)
    print('... {0} runs averaged {1} seconds'.format(runs, (sum(times) / runs)))
    return iostream
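# A small, hypothetical driver for the benchmarking helper above; the schema,
# record count, and run count are illustrative assumptions, and write() is
# the function defined in the previous snippet.
bench_schema = {
    "type": "record",
    "name": "Benchmark",
    "fields": [{"name": "n", "type": "int"}],
}
bench_records = [{"n": i} for i in range(10000)]
write(bench_schema, bench_records, runs=5)  # prints the averaged write time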
def test_schema_migration_add_default_field():
    schema = {"type": "record", "fields": []}
    new_schema = {
        "type": "record",
        "fields": [{
            "name": "test",
            "type": "string",
            "default": "default"
        }]
    }
    new_file = MemoryIO()
    records = [{}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == [{"test": "default"}]
def test_schema_migration_reader_union():
    schema = {"type": "record", "fields": [{"name": "test", "type": "int"}]}
    new_schema = {
        "type": "record",
        "fields": [{"name": "test", "type": ["string", "int"]}]
    }
    new_file = MemoryIO()
    records = [{"test": 1}]
    fastavro.writer(new_file, schema, records)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file, new_schema)
    new_records = list(new_reader)
    assert new_records == records
def make_blocks(num_records=2000, codec='null'):
    records = make_records(num_records)
    new_file = MemoryIO()
    fastavro.writer(new_file, schema, records, codec=codec)
    new_file.seek(0)
    block_reader = fastavro.block_reader(new_file, schema)
    blocks = list(block_reader)
    new_file.close()
    return blocks, records
def make_blocks(num_records=2000, codec='null', write_to_disk=False):
    records = make_records(num_records)
    new_file = NamedTemporaryFile() if write_to_disk else MemoryIO()
    fastavro.writer(new_file, schema, records, codec=codec)
    bytes = new_file.tell()
    new_file.seek(0)
    block_reader = fastavro.block_reader(new_file, schema)
    blocks = list(block_reader)
    new_file.close()
    return blocks, records, bytes
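# A hedged sketch of consuming the output of make_blocks() above: each Block
# yielded by fastavro.block_reader can be iterated to recover its records, so
# flattening the blocks should reproduce the original record list. The helper
# name and the argument value here are assumptions based on the snippet above.
def check_blocks_roundtrip():
    blocks, records, size = make_blocks(num_records=100)
    recovered = [record for block in blocks for record in block]
    assert recovered == records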
def test_py3_union_string_and_bytes():
    schema = {
        "fields": [{'name': 'field', 'type': ['string', 'bytes']}],
        "namespace": "namespace",
        "name": "union_string_bytes",
        "type": "record"
    }
    records = [
        {'field': u'string'},
        {'field': b'bytes'}
    ]
    buf = BytesIO()
    fastavro.writer(buf, schema, records)
def test_schema_migration_enum_failure(): schema = {"type": "enum", "name": "test", "symbols": ["FOO", "BAR"]} new_schema = {"type": "enum", "name": "test", "symbols": ["BAZ", "BAR"]} new_file = MemoryIO() records = ["FOO"] fastavro.writer(new_file, schema, records) new_file.seek(0) new_reader = fastavro.reader(new_file, new_schema) try: list(new_reader) except fastavro._reader.SchemaResolutionError: pass else: assert False
def test_schema_migration_enum_failure(): schema = { "type": "enum", "name": "test", "symbols": ["FOO", "BAR"], } new_schema = { "type": "enum", "name": "test", "symbols": ["BAZ", "BAR"], } new_file = MemoryIO() records = ["FOO"] fastavro.writer(new_file, schema, records) new_file.seek(0) new_reader = fastavro.reader(new_file, new_schema) list(new_reader)
def fastavro_avro(N):
    from fastavro import writer
    import numpy as np

    INTERVAL = 1
    t_start = time.time()

    t0 = time.time()
    nums = np.random.random_integers(0, 100, (N, 4))
    print("Generated data ({:.2f})".format(time.time() - t0))

    t0 = time.time()
    data = [dict(zip((col1, col2, col3, col4), item)) for item in nums]
    print("Transformed data ({:.2f})".format(time.time() - t0))

    with open("fast_avro_{}_ints.avro".format(N), "wb") as out:
        writer(out, python_schema, data)

    print("Finished ({:.2f})".format(time.time() - t_start))
def test_fastavro_complex_nested():
    fo = MemoryIO()
    with open(join(data_dir, 'complex-nested.avsc')) as f:
        schema = json.load(f)
    records = [{
        "test_boolean": True,
        "test_int": 10,
        "test_long": 20,
        "test_float": 2.0,
        "test_double": 2.0,
        "test_bytes": b'asdf',
        "test_string": 'qwerty',
        "second_level": {
            "test_int2": 100,
            "test_string2": "asdf",
            "default_level": {
                "test_int_def": 1,
                "test_string_def": "nope",
            }
        },
        "fixed_int8": 1,
        "fixed_int16": 2,
        "fixed_int32": 3,
        "fixed_int64": 4,
        "fixed_uint8": 1,
        "fixed_uint16": 2,
        "fixed_uint32": 3,
        "fixed_uint64": 4,
        "fixed_int8_2": 12,
    }]
    fastavro.writer(fo, schema, records, enable_extensions=True)
    fo.seek(0)
    new_reader = fastavro.reader(fo, enable_extensions=True)
    assert new_reader.schema == schema
    new_records = list(new_reader)
    assert new_records == records
def write_avro():
    event_generator = (event for _ in range(n))
    fastavro.writer(avro_f, avro_schema, event_generator)
schema = json.load(open("plat.avsc")) records = [{ "nom": "饺子", "origine": "北京", "ingredients": ["chou", "porc", "farine"], "prix": 4, "type": "plat" }, { "nom": "方便面", "ingredients": ["piment", "nouilles"], "prix": 1.5, "type": "plat", }, { "nom": "宫保鸡丁", "origine": "四川", "ingredients": ["poulet", "cacahuetes"], "prix": 8, "type": "plat" }, { "nom": "米饭", "ingredients": ["riz"], "prix": 1, "type": "accompagnement" }, { "nom": "冰水", "prix": 0.5, "type": "accompagnement" }] fastavro.writer(open("plats.avro", "wb"), schema, records)
    ]
}

nodes = []
tree = ET.parse(open(SOURCE_FILE))
for node in tree.iterfind('node'):
    nodes.append({
        'id': int(node.get('id')),
        'longitude': float(node.get('lon')),
        'latitude': float(node.get('lat')),
        'username': node.get('user')
    })

# Dump nodes dictionary in an avro file
with open(AVRO_FILE, 'wb') as avro_file:
    fastavro.writer(avro_file, schema, nodes)

# Dump nodes dictionary in an avro file and use snappy compression algorithm
with open(AVRO_SNAPPY_FILE, 'wb') as avro_file:
    fastavro.writer(avro_file, schema, nodes, codec='snappy')

# Dump nodes dictionary in an avro file and use Bzip2 compression algorithm
with open(AVRO_BZIP2_FILE, 'wb') as avro_file:
    fastavro.writer(avro_file, schema, nodes, codec='bzip2')

# do the same with JSON format (for comparison)
with open(JSON_FILE, 'w') as json_file:
    json.dump([schema, nodes], json_file)

# Compare the size of the file formats
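# A possible continuation of the size comparison hinted at above; the file
# name constants are the ones assumed earlier in this snippet.
import os

for label, path in [('avro', AVRO_FILE),
                    ('avro + snappy', AVRO_SNAPPY_FILE),
                    ('avro + bzip2', AVRO_BZIP2_FILE),
                    ('json', JSON_FILE)]:
    print('{}: {} bytes'.format(label, os.path.getsize(path)))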
def _create_avro_example(test_schema, test_table):
    parsed_schema = fastavro.parse_schema(test_schema)
    rows = dicts(test_table)

    with NamedTemporaryFile(delete=False, mode='wb') as fo:
        fastavro.writer(fo, parsed_schema, rows)
    return fo.name
def main():
    parser = argparse.ArgumentParser(
        description='load IgBLAST annotations into an Avro sequence record',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # input files
    parser.add_argument('parse_label', metavar='label',
                        help='the parse label to use for the parse')
    parser.add_argument('repertoire_filenames', metavar='repertoire-file',
                        nargs=3,
                        help='the V(D)J repertoire file used in IgBLAST')
    parser.add_argument('seq_record_filename', metavar='seq_record.avro',
                        help='the Avro file with the sequence records')
    parser.add_argument(
        'igblast_output_filenames', metavar='parse.igblast', nargs='+',
        help='the output of IgBLAST to parse and attach to the sequence record'
    )
    # options
    parser.add_argument('--min-v-score', metavar='S', type=float,
                        default=70.0,
                        help='the minimum score for the V-segment')
    parser.add_argument('--min-j-score', metavar='S', type=float,
                        default=26.0,
                        help='the minimum score for the J-segment')

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    start_time = time.time()

    logging.info('calculating V(D)J repertoire lengths')
    germline_lengths = {}
    for rep_filename in args.repertoire_filenames:
        with open(rep_filename, 'rt') as rep_handle:
            for record in SeqIO.parse(rep_handle, 'fasta'):
                germline_lengths[record.id] = len(record)

    logging.info('adding parses to sequence records')
    with open_compressed(args.seq_record_filename, 'rb') as seq_record_handle:
        seq_record_reader = fastavro.reader(seq_record_handle)
        igblast_parse_reader = igblast_chain(args.igblast_output_filenames)
        annotator = igblast_annotator(germline_lengths, seq_record_reader,
                                      igblast_parse_reader, args.parse_label,
                                      args.min_v_score, args.min_j_score)
        fastavro.writer(sys.stdout.buffer, seq_record_reader.writer_schema,
                        annotator, codec='bzip2')

    elapsed_time = time.time() - start_time
    logging.info(
        'elapsed time %s',
        time.strftime('%H hours, %M minutes, %S seconds',
                      time.gmtime(elapsed_time)))
def test_unsupported_codec():
    schema = {
        "doc": "A weather reading.",
        "name": "Weather",
        "namespace": "test",
        "type": "record",
        "fields": [
            {"name": "station", "type": "string"},
            {"name": "time", "type": "long"},
            {"name": "temp", "type": "int"},
        ],
    }
    records = [
        {"station": "011990-99999", "temp": 0, "time": 1433269388},
        {"station": "011990-99999", "temp": 22, "time": 1433270389},
        {"station": "011990-99999", "temp": -11, "time": 1433273379},
        {"station": "012650-99999", "temp": 111, "time": 1433275478},
    ]

    file = MemoryIO()
    with pytest.raises(ValueError, match="unrecognized codec"):
        fastavro.writer(file, schema, records, codec="unsupported")

    file = MemoryIO()
    fastavro.writer(file, schema, records, codec="deflate")

    # Change the avro binary to act as if it were written with a codec called
    # `unsupported`
    modified_avro = file.getvalue().replace(b"\x0edeflate", b"\x16unsupported")
    modified_file = MemoryIO(modified_avro)
    with pytest.raises(ValueError, match="Unrecognized codec"):
        list(fastavro.reader(modified_file))
def create_cqi_output(filename):
    lst = list()

    # read in one avro file
    with open(const.get_cqi_input_file_path() + filename, 'rb') as fo:
        reader = fastavro.reader(fo)
        for record in reader:
            lst.append([
                record['itemId'], record['productId'], record['categoryCode'],
                record['originalAttr'], record['normalizedAttr'],
                record['excludeType'], record['categoryCodeLv1'],
                record['categoryNameLv1']
            ])

    # noinspection PyUnresolvedReferences
    df = pd.DataFrame(lst, columns=[
        'itemId', 'productId', 'categoryCode', 'originalAttr',
        'normalizedAttr', 'excludeType', 'categoryCodeLv1', 'categoryNameLv1'
    ])
    lst = None

    df['originCateCode'] = df['categoryCode']
    df['originString'] = df['originalAttr']
    df['cleanseString'] = ''
    df['predCateCode'] = ''
    df['predCateCode1'] = ''
    df['predCateCode2'] = ''
    df['predCateCode3'] = ''
    df['predCateCode4'] = ''
    df['predCateCode5'] = ''
    df['predCateCode6'] = ''
    df['scoreCateCode1'] = 0.0
    df['scoreCateCode2'] = 0.0
    df['scoreCateCode3_6'] = 0.0
    df['scoreFinal'] = 0.0
    df['success'] = 0

    # noinspection PyUnresolvedReferences
    cleansed_prod_df = pd.read_csv(
        const.get_cleansed_prod_dictionary_file_name(),
        names=['productId', 'isCleansed'],
        sep='\t',
        dtype=[('productId', 'long'), ('isCleansed', 'str')])

    # df = pd.merge(df, book_cate_df, on='originCateCode', how='left')
    # df = pd.merge(df, jikgu_prod_df, on='productId', how='left')
    # noinspection PyUnresolvedReferences
    df = pd.merge(df, cleansed_prod_df, on='productId', how='left')

    for i, row in df.iterrows():
        if not df.at[i, 'originString'] or len(df.at[i, 'originString']) == 0:
            continue

        pred = predict.predict(
            model_lv1,
            model_lv2s,
            model_lv3s,
            df.at[i, 'normalizedAttr'],  # input already garbage filtered string
            product_id=df.at[i, 'productId'],
            item_id=df.at[i, 'itemId'],
            garbage_filter=False)[0]

        df.at[i, 'cleanseString'] = pred.get_normalized_input_string()

        if "OLD" not in str(df.at[i, 'categoryNameLv1']).upper():
            if "JIKGU" in df.at[i, 'excludeType']:
                continue
            if "BOOK" in df.at[i, 'excludeType']:
                continue
            if "DVD" in df.at[i, 'excludeType']:
                continue
            if df.at[i, 'isCleansed'] == '1':
                if len(str(df.at[i, 'excludeType'])) == 0:
                    df.at[i, 'excludeType'] = 'OPERATOR_MODEL'
                else:
                    df.at[i, 'excludeType'] = str(
                        df.at[i, 'excludeType']) + ',OPERATOR_MODEL'
                continue

        if pred.get_predict_error() is True:
            continue

        if pred.get_final_score() < 0.25:
            df.at[i, 'scoreCateCode1'] = pred.get_lv1_score()
            df.at[i, 'scoreCateCode2'] = pred.get_lv2_score()
            df.at[i, 'scoreCateCode3_6'] = pred.get_lv3_score()
            df.at[i, 'scoreFinal'] = pred.get_final_score()
            continue

        df.at[i, 'predCateCode'] = pred.get_catecode()
        df.at[i, 'predCateCode1'] = pred.get_lv1_catecode()
        df.at[i, 'predCateCode2'] = pred.get_lv2_catecode()
        df.at[i, 'predCateCode3'] = pred.get_lv3_catecode()
        df.at[i, 'predCateCode4'] = pred.get_lv4_catecode()
        df.at[i, 'predCateCode5'] = pred.get_lv5_catecode()
        df.at[i, 'predCateCode6'] = pred.get_lv6_catecode()
        df.at[i, 'scoreCateCode1'] = pred.get_lv1_score()
        df.at[i, 'scoreCateCode2'] = pred.get_lv2_score()
        df.at[i, 'scoreCateCode3_6'] = pred.get_lv3_score()
        df.at[i, 'scoreFinal'] = pred.get_final_score()

        if pred.get_predict_error() is True:
            df.at[i, 'success'] = 0
        else:
            df.at[i, 'success'] = 1

    # write result out to avro file
    schema = {
        'name': 'topLevelRecord',
        'type': 'record',
        'fields': [
            {'name': 'itemId', 'type': ['long', 'null']},
            {'name': 'productId', 'type': ['long', 'null']},
            {'name': 'originCateCode', 'type': ['string', 'null']},
            {'name': 'originString', 'type': 'string'},
            {'name': 'cleanseString', 'type': 'string'},
            {'name': 'predCateCode', 'type': ['string', 'null']},
            {'name': 'predCateCode1', 'type': ['string', 'null']},
            {'name': 'predCateCode2', 'type': ['string', 'null']},
            {'name': 'predCateCode3', 'type': ['string', 'null']},
            {'name': 'predCateCode4', 'type': ['string', 'null']},
            {'name': 'predCateCode5', 'type': ['string', 'null']},
            {'name': 'predCateCode6', 'type': ['string', 'null']},
            {'name': 'scoreCateCode1', 'type': ['float', 'null']},
            {'name': 'scoreCateCode2', 'type': ['float', 'null']},
            {'name': 'scoreCateCode3_6', 'type': ['float', 'null']},
            {'name': 'scoreFinal', 'type': ['float', 'null']},
            {'name': 'excludeType', 'type': 'string'},
        ]
    }

    output = df[[
        'itemId', 'productId', 'originCateCode', 'originString',
        'cleanseString', 'predCateCode', 'predCateCode1', 'predCateCode2',
        'predCateCode3', 'predCateCode4', 'predCateCode5', 'predCateCode6',
        'scoreCateCode1', 'scoreCateCode2', 'scoreCateCode3_6', 'scoreFinal',
        'excludeType'
    ]]

    records = output.to_json(orient='records')
    records = json.loads(records)

    with open(const.get_cqi_output_file_path() + filename, 'wb') as out:
        fastavro.writer(out, schema, records)
        logger.info("Successfully write " + filename)
    }, {
        'name': 'value',
        'type': 'long'
    }],
    'name': 'AutoGen',
    'namespace': 'autogenerated',
    'type': 'record'
}

keys = ("key%s" % s for s in range(10000))
vals = range(10000)
data = [{'key': key, 'value': val} for key, val in zip(keys, vals)]

f = BytesIO()
fastavro.writer(f, schema, data)
f.seek(0)
avro_bytes = f.read()
f.seek(0)
av = fastavro.reader(f)
header = av._header


def test_avro_body():
    sync = header['sync']
    subset = sync.join(avro_bytes.split(sync)[2:4])
    assert subset
    for b in (avro_bytes, subset):
        b = b.split(sync, 1)[1]
schema = {
    'name': 'TestRecord',
    'type': 'record',
    'fields': [
        {'name': 'D0', 'type': 'string', 'pinotType': 'DIMENSION'},
        {'name': 'D1', 'type': 'string', 'pinotType': 'DIMENSION'},
        {'name': 'D2', 'type': 'string', 'pinotType': 'DIMENSION'},
        {'name': 'daysSinceEpoch', 'type': 'long', 'pinotType': 'TIME'},
        {'name': 'M0', 'type': 'long', 'pinotType': 'METRIC'},
        {'name': 'M1', 'type': 'double', 'pinotType': 'METRIC'}
    ]
}

records = []
for i in xrange(args.num_records):
    record = {
        'D0': str(i % 2),
        'D1': str(i % 4),
        'D2': str(i % 8),
        'daysSinceEpoch': int(i % args.num_time_buckets),
        'M0': 1,
        'M1': 1.0
    }
    records.append(record)

print 'Writing {}'.format(sys.argv[1])
with open(sys.argv[1], 'wb') as out:
    writer(out, schema, records)
characters = [{ "id": 1, "name": "Martin Riggs" }, { "id": 2, "name": "John Wick" }, { "id": 3, "name": "Ripley" }] # Définition du schéma des données schema = { "type": "record", "namespace": "com.badassmoviecharacters", "name": "Character", "doc": "Seriously badass characters", "fields": [{ "name": "name", "type": "string" }, { "name": "id", "type": "int" }] } # Ouverture d'un fichier binaire en mode écriture with open("characters.avro", 'wb') as avro_file: # Ecriture des données fastavro.writer(avro_file, schema, characters, codec="deflate")
from fastavro import reader, writer, parse_schema
import json

inpFile = str(input("Enter the avro data file name: "))
inpSchemaFile = str(input("Enter the avro schema file name: "))
outFile = str(input("Enter the avro output file: ")) or "outdata.avro"

with open(inpSchemaFile, 'rb') as sc:
    schema = sc.read()
parsed = parse_schema(json.loads(schema))

with open(inpFile, 'rb') as inp:
    records = [r for r in reader(inp)]

records.append(records[-1])

flag = 1
while flag:
    field = str(input("Which field you want to edit: "))
    if '.' in field:
        pass
    else:
        records[-1][field] = int(
            input("Enter the value for " + field + ": "))
    flag = int(input("Press 1 to continue or 0 to halt: "))

print(records[-1])

with open(outFile, 'wb') as out:
    writer(out, parsed, records)
import sys
from xml.etree import ElementTree as ET

import fastavro

osm_file = sys.argv[1]
schema = fastavro.schema.load_schema(sys.argv[2])
output_folder = sys.argv[3]
compression_codec = 'null'
if len(sys.argv) > 4:
    compression_codec = sys.argv[4]

nodes = []
tree = ET.parse(open(osm_file))
for node in tree.iterfind('node'):
    nodes.append({
        'id': int(node.get('id')),
        'longitude': float(node.get('lon')),
        'latitude': float(node.get('lat')),
        'username': node.get('user')
    })

# Dump nodes dictionary in an avro file
osm_file_name = osm_file.split('/')[-1]
avro_file = output_folder + osm_file_name[:-3] + 'avro'
with open(avro_file, 'wb') as af:
    fastavro.writer(af, schema, nodes, codec=compression_codec)
characters = [{ "id": 1, "name": "Martin Riggs" }, { "id": 2, "name": "John Wick" }, { "id": 3, "name": "Ripley" }] # Définition du schéma des données schema = { "type": "record", "namespace": "com.badassmoviecharacters", "name": "Character", "doc": "Seriously badass characters", "fields": [{ "name": "name", "type": "string" }, { "name": "id", "type": "int" }] } # Ouverture d'un fichier binaire en mode écriture with open("characters.avro", 'wb') as avro_file: # Ecriture des données fastavro.writer(avro_file, schema, characters)
def serialise_event_message(event_type, name):
    buffer = io.BytesIO()
    writer(buffer, event_schema, [{'event_type': event_type, 'name': name}])
    return buffer.getvalue()
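# A matching deserialisation sketch (an assumption, not part of the original
# snippet): fastavro.writer produces a complete Avro container, so the bytes
# can be read back with fastavro.reader without supplying the schema again.
import io

from fastavro import reader


def deserialise_event_message(payload):
    buffer = io.BytesIO(payload)
    return list(reader(buffer))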
def write_avro_records(id, records):
    fpath = 'app-{}.avro'.format(id)
    with open(fpath, 'wb') as out:
        writer(out, PARSED_SCHEMA, records)
    return fpath
def write_avro_data_to_file_with_schema(filename, json_schema, records):
    """Write out large numbers of records with the schema.

    This makes for easier reading and does not significantly affect space.
    """
    with open(filename, 'wb') as out:
        fastavro.writer(out, json_schema, records)
import fastavro
import json
from io import BytesIO

schema = fastavro.parse_schema({
    "type": "record",
    "name": "testing",
    "fields": [{"name": "email", "type": "string"}]
})

data_dict = {}
data_dict['email'] = '*****@*****.**'

with BytesIO() as buf:
    fastavro.writer(buf, schema, [data_dict])
    buf.seek(0)
    result = fastavro.reader(buf)
    print([email for email in result])

# bio.seek(0)
# result = list(fastavro.reader(bio))
# for r in result:
#     print(r)
def test_xz_works_by_default_on_windows_python3():
    schema = {
        "doc": "A weather reading.",
        "name": "Weather",
        "namespace": "test",
        "type": "record",
        "fields": [
            {"name": "station", "type": "string"},
            {"name": "time", "type": "long"},
            {"name": "temp", "type": "int"},
        ],
    }
    records = [
        {"station": "011990-99999", "temp": 0, "time": 1433269388},
        {"station": "011990-99999", "temp": 22, "time": 1433270389},
        {"station": "011990-99999", "temp": -11, "time": 1433273379},
        {"station": "012650-99999", "temp": 111, "time": 1433275478},
    ]

    file = MemoryIO()
    if sys.version_info >= (3, 0):
        fastavro.writer(file, schema, records, codec="xz")
        file.seek(0)
        out_records = list(fastavro.reader(file))
        assert records == out_records
    else:
        with pytest.raises(
                ValueError,
                match="xz codec is supported but you need to install"):
            fastavro.writer(file, schema, records, codec="xz")
def test_union_records():
    schema = {
        'name': 'test_name',
        'namespace': 'test',
        'type': 'record',
        'fields': [{
            'name': 'val',
            'type': [{
                'name': 'a',
                'namespace': 'common',
                'type': 'record',
                'fields': [
                    {'name': 'x', 'type': 'int'},
                    {'name': 'y', 'type': 'int'},
                ],
            }, {
                'name': 'b',
                'namespace': 'common',
                'type': 'record',
                'fields': [
                    {'name': 'x', 'type': 'int'},
                    {'name': 'y', 'type': 'int'},
                    {'name': 'z', 'type': ['null', 'int']},
                ],
            }]
        }]
    }
    data = [{
        'val': {
            'x': 3,
            'y': 4,
            'z': 5,
        }
    }]

    new_file = MemoryIO()
    fastavro.writer(new_file, schema, data)
    new_file.seek(0)
    new_reader = fastavro.reader(new_file)
    new_records = list(new_reader)
    assert new_records == data
def serialize(schema, *records):
    buffer = BytesIO()
    fastavro.writer(buffer, schema, records)
    serialized = buffer.getvalue()
    return serialized
cohort_data_entity = {
    "name": "icdc.cohort",
    "id": "n201",
    "object": cohort_data,
    "relations": []
}

assert validate(("pfb.Entity", cohort_data_entity), pfb_schema)

payload = [{
    "name": "Metadata",
    "object": {
        "name": "pfb.Metadata",
        "misc": {},
        "nodes": [icdc_cohort_meta, icdc_case_meta]
    }
}, cohort_data_entity, case_data_entity]

# Create PFB message
with open("worked-example.avro", "wb") as out:
    fastavro.writer(out, pfb_schema, payload)

# Read records from message
with open("worked-example.avro", "rb") as inf:
    rdr = fastavro.reader(inf)
    for rec in rdr:
        print(rec)
def _write_avro_part(part, f, schema, codec, sync_interval, metadata):
    """Create single avro file from list of dictionaries"""
    import fastavro
    with f as f:
        fastavro.writer(f, schema, part, codec, sync_interval, metadata)
def serialize(self):
    """Serializes the ClickEvent for sending to Kafka"""
    out = BytesIO()
    writer(out, ClickEvent.schema, [asdict(self)])
    return out.getvalue()
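# A hypothetical counterpart to ClickEvent.serialize() above, assuming reader
# is imported from fastavro alongside writer and that the ClickEvent
# dataclass accepts the record's fields as keyword arguments.
@classmethod
def deserialize(cls, data):
    """Rebuilds a ClickEvent from bytes produced by serialize()"""
    records = list(reader(BytesIO(data)))
    return cls(**records[0])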
"type": "array", "items": "float" }) def generate(): start = stop = 0 while start < len(array): stop = min(stop + events_per_basket, len(array)) chunk = json.loads(ak.to_json(array[start:stop])) for x in chunk: yield x print(int(round(100 * stop / len(array))), "percent", time.asctime(time.localtime())) start = stop for level in [9, 1]: # 9, 1, 0: print("level", level) with open( "/home/jpivarski/storage/data/chep-2021-jagged-jagged-jagged/lzfour" + str(level) + "-jagged1.avro", "wb") as out: fastavro.writer( out, schema, generate(), codec="lz4", # "deflate", codec_compression_level=level, sync_interval=45633959, )
def put_avro(self, schema, records, blob_name, codec='snappy'):
    path = self._get_path_and_create_dir(blob_name)
    with open(path, "wb") as f:
        fastavro.writer(f, schema, records, codec)
    size = os.path.getsize(path)
    return Blob(blob_name, size)
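# A hypothetical read-side counterpart to put_avro() above, reusing the same
# path layout; the method name and the _get_path_and_create_dir call are
# assumptions based on the snippet it follows.
def get_avro(self, blob_name):
    path = self._get_path_and_create_dir(blob_name)
    with open(path, "rb") as f:
        return list(fastavro.reader(f))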