def _read_block_header(self):
    self._block_count = self.raw_decoder.read_long()
    if self.codec == "null":
        # Skip a long; we don't need to use the length.
        self.raw_decoder.skip_long()
        self._datum_decoder = self._raw_decoder
    elif self.codec == 'deflate':
        # Compressed data is stored as (length, data), which
        # corresponds to how the "bytes" type is encoded.
        data = self.raw_decoder.read_bytes()
        # -15 is the log of the window size; negative indicates
        # "raw" (no zlib headers) decompression. See zlib.h.
        uncompressed = zlib.decompress(data, -15)
        self._datum_decoder = avro_io.BinaryDecoder(io.BytesIO(uncompressed))
    elif self.codec == 'snappy':
        # Compressed data includes a 4-byte CRC32 checksum.
        length = self.raw_decoder.read_long()
        data = self.raw_decoder.read(length - 4)
        uncompressed = snappy.decompress(data)
        self._datum_decoder = avro_io.BinaryDecoder(io.BytesIO(uncompressed))
        self.raw_decoder.check_crc32(uncompressed)
    else:
        raise DataFileException("Unknown codec: %r" % self.codec)

def _read_block_header(self):
    self.block_count = self.raw_decoder.read_long()
    if self.codec == "null":
        # Skip a long; we don't need to use the length.
        self.raw_decoder.skip_long()
        self._datum_decoder = self._raw_decoder
    elif self.codec == 'deflate':
        # Compressed data is stored as (length, data), which
        # corresponds to how the "bytes" type is encoded.
        data = self.raw_decoder.read_bytes()
        # -15 is the log of the window size; negative indicates
        # "raw" (no zlib headers) decompression. See zlib.h.
        uncompressed = zlib.decompress(data, -15)
        self._datum_decoder = io.BinaryDecoder(StringIO(uncompressed))
    elif self.codec == 'snappy':
        # Compressed data includes a 4-byte CRC32 checksum.
        length = self.raw_decoder.read_long()
        data = self.raw_decoder.read(length - 4)
        uncompressed = snappy.decompress(data)
        self._datum_decoder = io.BinaryDecoder(StringIO(uncompressed))
        self.raw_decoder.check_crc32(uncompressed)
    elif self.codec == 'zstandard':
        length = self.raw_decoder.read_long()
        data = self.raw_decoder.read(length)
        uncompressed = bytearray()
        dctx = zstd.ZstdDecompressor()
        with dctx.stream_reader(StringIO(data)) as reader:
            while True:
                chunk = reader.read(16384)
                if not chunk:
                    break
                uncompressed.extend(chunk)
        # StringIO cannot wrap a bytearray; convert to a byte string first.
        self._datum_decoder = io.BinaryDecoder(StringIO(bytes(uncompressed)))
    else:
        raise DataFileException("Unknown codec: %r" % self.codec)

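# A minimal round-trip sketch (not from the original sources) that exercises
# the deflate branch of _read_block_header above. Assumes the Python 3 `avro`
# package; the path, schema, and records are illustrative.
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

SCHEMA = avro.schema.Parse(
    '{"type": "record", "name": "Point",'
    ' "fields": [{"name": "x", "type": "long"}]}')

writer = DataFileWriter(open('/tmp/points.avro', 'wb'), DatumWriter(),
                        SCHEMA, codec='deflate')
writer.append({'x': 1})
writer.append({'x': 2})
writer.close()

reader = DataFileReader(open('/tmp/points.avro', 'rb'), DatumReader())
for point in reader:
    print(point)  # {'x': 1}, then {'x': 2}
reader.close()
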
def check_skip_number(number_type):
    logging.debug('Testing skip number for %s', number_type)
    correct = 0
    for value_to_skip, hex_encoding in BINARY_ENCODINGS:
        VALUE_TO_READ = 6253
        logging.debug('Value to Skip: %d', value_to_skip)

        # write the value to skip and a known value
        writer_schema = schema.parse('"%s"' % number_type.lower())
        writer, encoder, datum_writer = write_datum(value_to_skip, writer_schema)
        datum_writer.write(VALUE_TO_READ, encoder)

        # skip the value
        reader = io.BytesIO(writer.getvalue())
        decoder = avro_io.BinaryDecoder(reader)
        decoder.skip_long()

        # read data from string buffer
        datum_reader = avro_io.DatumReader(writer_schema)
        read_value = datum_reader.read(decoder)
        logging.debug('Read Value: %d', read_value)
        if read_value == VALUE_TO_READ:
            correct += 1
    return correct

def read_meta_data_from_file(f):
    """Reads metadata from a given Avro file.

    Args:
      f: Avro file to read.

    Returns:
      a tuple containing the codec, schema, and the sync marker of the Avro
      file.

    Raises:
      ValueError: if the file does not start with the byte sequence defined
        in the specification.
    """
    if f.tell() > 0:
        f.seek(0)
    decoder = avroio.BinaryDecoder(f)
    header = avroio.DatumReader().read_data(datafile.META_SCHEMA,
                                            datafile.META_SCHEMA, decoder)
    if header.get('magic') != datafile.MAGIC:
        raise ValueError('Not an Avro file. File header should start with %s '
                         'but started with %s instead.'
                         % (datafile.MAGIC, header.get('magic')))
    meta = header['meta']

    if datafile.CODEC_KEY in meta:
        codec = meta[datafile.CODEC_KEY]
    else:
        codec = 'null'

    schema_string = meta[datafile.SCHEMA_KEY]
    sync_marker = header['sync']
    return codec, schema_string, sync_marker

def check_skip_number(number_type):
    print_test_name('TEST SKIP %s' % number_type.upper())
    correct = 0
    for value_to_skip, hex_encoding in BINARY_ENCODINGS:
        VALUE_TO_READ = 6253
        print 'Value to Skip: %d' % value_to_skip

        # write the value to skip and a known value
        writers_schema = schema.parse('"%s"' % number_type.lower())
        writer, encoder, datum_writer = write_datum(value_to_skip, writers_schema)
        datum_writer.write(VALUE_TO_READ, encoder)

        # skip the value
        reader = StringIO(writer.getvalue())
        decoder = io.BinaryDecoder(reader)
        decoder.skip_long()

        # read data from string buffer
        datum_reader = io.DatumReader(writers_schema)
        read_value = datum_reader.read(decoder)
        print 'Read Value: %d' % read_value
        if read_value == VALUE_TO_READ:
            correct += 1
        print ''
    return correct

def read_block_from_file(f, codec, schema, expected_sync_marker):
    """Reads a block from a given Avro file.

    Args:
      f: Avro file to read.
      codec: The codec to use for block-level decompression.
        Supported codecs: 'null', 'deflate', 'snappy'
      schema: Avro Schema definition represented as JSON string.
      expected_sync_marker: Avro synchronization marker. If the block's sync
        marker does not match with this parameter then ValueError is thrown.

    Returns:
      A single _AvroBlock.

    Raises:
      ValueError: If the block cannot be read properly because the file
        doesn't match the specification.
    """
    offset = f.tell()
    decoder = avroio.BinaryDecoder(f)
    num_records = decoder.read_long()
    block_size = decoder.read_long()
    block_bytes = decoder.read(block_size)
    sync_marker = decoder.read(len(expected_sync_marker))
    if sync_marker != expected_sync_marker:
        raise ValueError(
            'Unexpected sync marker (actual "%s" vs expected "%s"). '
            'Maybe the underlying avro file is corrupted?'
            % (sync_marker, expected_sync_marker))
    size = f.tell() - offset
    return _AvroBlock(block_bytes, num_records, codec, schema, offset, size)

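# A hedged sketch (not part of the original module) showing how the two Beam
# helpers combine: read the header once, then walk sync-delimited blocks until
# EOF. The file name is a placeholder; the one-byte read is only an EOF probe.
with open('events.avro', 'rb') as f:
    codec, schema_string, sync_marker = read_meta_data_from_file(f)
    blocks = []
    while f.read(1):  # any bytes left after the previous block?
        f.seek(-1, 1)  # rewind the probed byte
        blocks.append(
            read_block_from_file(f, codec, schema_string, sync_marker))
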
def _decompress_bytes(data, codec):
    if codec == 'null':
        return data
    elif codec == 'deflate':
        # zlib.MAX_WBITS is the window size. '-' sign indicates that this is
        # raw data (without headers). See zlib and Avro documentations for
        # more details.
        return zlib.decompress(data, -zlib.MAX_WBITS)
    elif codec == 'snappy':
        # Snappy is an optional avro codec.
        # See Snappy and Avro documentation for more details.
        try:
            import snappy
        except ImportError:
            raise ValueError('Snappy does not seem to be installed.')

        # Compressed data includes a 4-byte CRC32 checksum which we verify.
        # We take care to avoid extra copies of data while slicing large
        # objects by use of a buffer.
        result = snappy.decompress(buffer(data)[:-4])
        avroio.BinaryDecoder(cStringIO.StringIO(data[-4:])).check_crc32(result)
        return result
    else:
        raise ValueError('Unknown codec: %r' % codec)

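# A quick self-check (not from the original source) of the raw-deflate
# convention used above: compress with a negative window size so no zlib
# header/trailer is emitted, then round-trip through the same
# zlib.decompress(data, -zlib.MAX_WBITS) call.
import zlib

payload = b'avro block payload' * 10
compressor = zlib.compressobj(zlib.Z_DEFAULT_COMPRESSION, zlib.DEFLATED,
                              -zlib.MAX_WBITS)
raw_deflate = compressor.compress(payload) + compressor.flush()
assert zlib.decompress(raw_deflate, -zlib.MAX_WBITS) == payload
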
def __init__(self, reader, datum_reader): """Initializes a new data file reader. Args: reader: Open file to read from. datum_reader: Avro datum reader. """ self._reader = reader self._raw_decoder = avro_io.BinaryDecoder(reader) self._datum_decoder = None # Maybe reset at every block. self._datum_reader = datum_reader # read the header: magic, meta, sync self._read_header() # ensure codec is valid avro_codec_raw = self.GetMeta('avro.codec') if avro_codec_raw is None: self.codec = "null" else: self.codec = avro_codec_raw.decode('utf-8') if self.codec not in VALID_CODECS: raise DataFileException('Unknown codec: %s.' % self.codec) self._file_length = self._GetInputFileLength() # get ready to read self._block_count = 0 self.datum_reader.writer_schema = (schema.Parse( self.GetMeta(SCHEMA_KEY).decode('utf-8')))
def deserialize(x):
    schema_path = "data/files/fb_scheam.avsc"
    schema1 = schema.Parse(open(schema_path).read())
    bytes_reader = io2.BytesIO(x)
    decoder = io.BinaryDecoder(bytes_reader)
    reader = io.DatumReader(schema1)
    message = reader.read(decoder)
    return message

def _process_handshake(self, call_response, message_name, request_datum):
    # process the handshake and call response
    buffer_decoder = io.BinaryDecoder(StringIO(call_response))
    call_response_exists = self.read_handshake_response(buffer_decoder)
    if call_response_exists:
        return self.read_call_response(message_name, buffer_decoder)
    else:
        return self.request(message_name, request_datum)

def load_report(data):
    input = StringIO(data)
    dreader = io.DatumReader(writers_schema=REPORT_SCHEMA,
                             readers_schema=REPORT_SCHEMA)
    v = dreader.read(io.BinaryDecoder(input))
    map(stringify_uuids, v.itervalues())
    remove_none(v)
    print v
    return v

def issue_request(self, call_request, message_name, request_datum):
    call_response = self.transceiver.transceive(call_request)

    # process the handshake and call response
    buffer_decoder = io.BinaryDecoder(StringIO(call_response))
    call_response_exists = self.read_handshake_response(buffer_decoder)
    if call_response_exists:
        return self.read_call_response(message_name, buffer_decoder)
    else:
        return self.request(message_name, request_datum)

def _IssueRequest(self, call_request, message_name, request_datum):
    call_response = self.transceiver.Transceive(call_request)

    # process the handshake and call response
    buffer_decoder = avro_io.BinaryDecoder(io.BytesIO(call_response))
    call_response_exists = self._ReadHandshakeResponse(buffer_decoder)
    if call_response_exists:
        return self._ReadCallResponse(message_name, buffer_decoder)
    else:
        return self.Request(message_name, request_datum)

def consumer2():
    consumer = KafkaConsumer('test')
    schema_path = "data/files/fb_scheam.avsc"
    schema1 = schema.Parse(open(schema_path).read())
    for msg in consumer:
        bytes_reader = io2.BytesIO(msg.value)
        decoder = io.BinaryDecoder(bytes_reader)
        reader = io.DatumReader(schema1)
        message = reader.read(decoder)
        # Returns after decoding the first message from the topic.
        return message

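# The producing side of consumer2, sketched (not from the original source)
# with kafka-python and the same avro modules; the topic name and schema path
# mirror the snippet above, and the record contents are up to the caller.
from kafka import KafkaProducer

def produce_one(record):
    schema1 = schema.Parse(open("data/files/fb_scheam.avsc").read())
    bytes_writer = io2.BytesIO()
    encoder = io.BinaryEncoder(bytes_writer)  # counterpart of BinaryDecoder
    io.DatumWriter(schema1).write(record, encoder)
    producer = KafkaProducer()  # defaults to localhost:9092
    producer.send('test', bytes_writer.getvalue())
    producer.flush()
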
def records(self):
    decoder = avroio.BinaryDecoder(
        cStringIO.StringIO(self._decompressed_block_bytes))
    reader = avroio.DatumReader(
        writers_schema=self._schema, readers_schema=self._schema)

    current_record = 0
    while current_record < self._num_records:
        yield reader.read(decoder)
        current_record += 1

def callback(ch, method, properties, body):
    start_time = time.clock()
    bytes_reader = BytesIO(body)
    decoder = avro_io.BinaryDecoder(bytes_reader)
    reader = avro_io.DatumReader(
        schema.Parse(open(f"schemas/{exchange}.avsc", "rb").read()))
    event_body = reader.read(decoder)
    time.sleep(0.1)  # Mock feature computing time
    print(f"Event received: "
          f"size: {sys.getsizeof(event_body)} bytes, "
          f"time: {time.clock() - start_time} secs")

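# Wiring sketch (assumed, not from the original source) for the callback
# above, using pika >= 1.0; `exchange` doubles as the queue name here and
# all names are placeholders.
import pika

exchange = 'events'
connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
channel = connection.channel()
channel.queue_declare(queue=exchange)
channel.basic_consume(queue=exchange, on_message_callback=callback,
                      auto_ack=True)
channel.start_consuming()
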
def testNoDefaultValue(self):
    writer_schema = LONG_RECORD_SCHEMA
    datum_to_write = LONG_RECORD_DATUM

    reader_schema = schema.Parse("""\
      {"type": "record", "name": "Test",
       "fields": [{"name": "H", "type": "int"}]}""")

    writer, encoder, datum_writer = write_datum(datum_to_write, writer_schema)

    reader = io.BytesIO(writer.getvalue())
    decoder = avro_io.BinaryDecoder(reader)
    datum_reader = avro_io.DatumReader(writer_schema, reader_schema)
    self.assertRaises(avro_io.SchemaResolutionException,
                      datum_reader.read, decoder)

def _read_block_header(self):
    self.block_count = self.raw_decoder.read_long()
    if self.codec == "null":
        # Skip a long; we don't need to use the length.
        self.raw_decoder.skip_long()
        self._datum_decoder = self._raw_decoder
    else:
        # Compressed data is stored as (length, data), which
        # corresponds to how the "bytes" type is encoded.
        data = self.raw_decoder.read_bytes()
        # -15 is the log of the window size; negative indicates
        # "raw" (no zlib headers) decompression. See zlib.h.
        uncompressed = zlib.decompress(data, -15)
        self._datum_decoder = io.BinaryDecoder(StringIO(uncompressed))

def test_no_default_value(self):
    print_test_name('TEST NO DEFAULT VALUE')
    writers_schema = LONG_RECORD_SCHEMA
    datum_to_write = LONG_RECORD_DATUM

    readers_schema = schema.parse("""\
      {"type": "record", "name": "Test",
       "fields": [{"name": "H", "type": "int"}]}""")

    writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema)

    reader = StringIO(writer.getvalue())
    decoder = io.BinaryDecoder(reader)
    datum_reader = io.DatumReader(writers_schema, readers_schema)
    self.assertRaises(io.SchemaResolutionException,
                      datum_reader.read, decoder)

def testUnknownSymbol(self):
    writer_schema = schema.Parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["FOO", "BAR"]}""")
    datum_to_write = 'FOO'

    reader_schema = schema.Parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["BAR", "BAZ"]}""")

    writer, encoder, datum_writer = write_datum(datum_to_write, writer_schema)

    reader = io.BytesIO(writer.getvalue())
    decoder = avro_io.BinaryDecoder(reader)
    datum_reader = avro_io.DatumReader(writer_schema, reader_schema)
    self.assertRaises(avro_io.SchemaResolutionException,
                      datum_reader.read, decoder)

def decode(self, data, *, type_identifier: int = None, **kwargs):  # pylint: disable=arguments-differ
    """
    Decode *data* from :class:`bytes` to the original data structure.

    :param data: a bytes object containing a serialized message.

    :param type_identifier: An integer specifying the identity of a
      registered Avro schema. If specified the schema name is used to
      lookup the schema in a schema registry.

    :returns: A Python object.
    """
    avroSchema = self.registry.get_schema_by_id(type_identifier)
    bytes_reader = io.BytesIO(data)
    decoder = avro_io.BinaryDecoder(bytes_reader)
    datum_reader = avro_io.DatumReader(avroSchema)
    return datum_reader.read(decoder)

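# A hedged sketch of a matching encode() for the codec above: the registry
# lookup (get_schema_by_id) is taken from decode(); the rest is standard
# avro.io usage with DatumWriter/BinaryEncoder.
def encode(self, obj, *, type_identifier: int = None, **kwargs):
    avroSchema = self.registry.get_schema_by_id(type_identifier)
    bytes_writer = io.BytesIO()
    encoder = avro_io.BinaryEncoder(bytes_writer)
    datum_writer = avro_io.DatumWriter(avroSchema)
    datum_writer.write(obj, encoder)
    return bytes_writer.getvalue()
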
def test_unknown_symbol(self):
    print_test_name('TEST UNKNOWN SYMBOL')
    writers_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["FOO", "BAR"]}""")
    datum_to_write = 'FOO'

    readers_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["BAR", "BAZ"]}""")

    writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema)

    reader = StringIO(writer.getvalue())
    decoder = io.BinaryDecoder(reader)
    datum_reader = io.DatumReader(writers_schema, readers_schema)
    self.assertRaises(io.SchemaResolutionException,
                      datum_reader.read, decoder)

def respond(self, call_request):
    """
    Called by a server to deserialize a request, compute and serialize
    a response or error. Compare to 'handle()' in Thrift.
    """
    buffer_reader = StringIO(call_request)
    buffer_decoder = io.BinaryDecoder(buffer_reader)
    buffer_writer = StringIO()
    buffer_encoder = io.BinaryEncoder(buffer_writer)
    error = None
    response_metadata = {}

    try:
        remote_protocol = self.process_handshake(buffer_decoder,
                                                 buffer_encoder)
        # handshake failure
        if remote_protocol is None:
            return buffer_writer.getvalue()

        # read request using remote protocol
        request_metadata = META_READER.read(buffer_decoder)
        remote_message_name = buffer_decoder.read_utf8()

        # get remote and local request schemas so we can do
        # schema resolution (one fine day)
        remote_message = remote_protocol.messages.get(remote_message_name)
        if remote_message is None:
            fail_msg = 'Unknown remote message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)
        local_message = self.local_protocol.messages.get(remote_message_name)
        if local_message is None:
            fail_msg = 'Unknown local message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)
        writers_schema = remote_message.request
        readers_schema = local_message.request
        request = self.read_request(writers_schema, readers_schema,
                                    buffer_decoder)

        # perform server logic
        try:
            response = self.invoke(local_message, request)
        except AvroRemoteException, e:
            error = e
        except Exception, e:
            error = AvroRemoteException(str(e))

def dump():
    f = open('bytes_json', 'rb')
    data = json.load(f)
    f.close()

    # Stringify numeric fields, presumably to match a string-typed Avro schema.
    data['RecordId']['Timestamp'] = str(data['RecordId']['Timestamp'])
    for timestamp in data['IMU']:
        for orientationType in data['IMU'][timestamp]:
            for dimension in data['IMU'][timestamp][orientationType]:
                data['IMU'][timestamp][orientationType][dimension] = str(
                    data['IMU'][timestamp][orientationType][dimension])

    # Encode with the module-level DatumWriter (`writer`) and time it.
    bytes_writer = io.BytesIO()
    encoder = avroIo.BinaryEncoder(bytes_writer)
    start_time = time.time()
    writer.write(data, encoder)
    print("encoding time for avro in seconds: %s" % (time.time() - start_time))

    avro_bytes = bytes_writer.getvalue()  # renamed to avoid shadowing bytes()
    print(len(avro_bytes))
    print(type(avro_bytes))
    with open('avro_output', 'wb') as f:
        f.write(avro_bytes)

    f_out_gzip = open('compressed_bytes_avro', 'wb')
    compressed_data = zlib.compress(avro_bytes)
    f_out_gzip.write(compressed_data)
    f_out_gzip.close()

    encoded = base64.b64encode(compressed_data)
    f = open('compressed_bytes_base64_fromAvro', 'wb')
    f.write(encoded)
    f.close()

    # decoding with the module-level DatumReader (`reader`)
    bytes_reader = io.BytesIO(avro_bytes)
    decoder = avroIo.BinaryDecoder(bytes_reader)
    start_time = time.time()
    original_data = reader.read(decoder)
    print("decoding time for avro in seconds: %s" % (time.time() - start_time))

def request(self, message_name, request_datum):
    """
    Writes a request message and reads a response or error message.
    """
    # build handshake and call request
    buffer_writer = StringIO()
    buffer_encoder = io.BinaryEncoder(buffer_writer)
    self.write_handshake_request(buffer_encoder)
    self.write_call_request(message_name, request_datum, buffer_encoder)

    # send the handshake and call request; block until call response
    call_request = buffer_writer.getvalue()
    call_response = self.transceiver.transceive(call_request)

    # process the handshake and call response
    buffer_decoder = io.BinaryDecoder(StringIO(call_response))
    call_response_exists = self.read_handshake_response(buffer_decoder)
    if call_response_exists:
        return self.read_call_response(message_name, buffer_decoder)
    else:
        return self.request(message_name, request_datum)

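# Typical call site for request(), modeled on the classic Avro Python IPC
# example; the protocol file, host, port, message name, and payload are
# placeholders.
from avro import ipc, protocol

PROTOCOL = protocol.parse(open('mail.avpr').read())
client = ipc.HTTPTransceiver('localhost', 9090)
requestor = ipc.Requestor(PROTOCOL, client)
result = requestor.request(
    'send', {'message': {'to': 'a', 'from': 'b', 'body': 'hi'}})
client.close()
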
def input(self, data, count):
    """
    Receive input from the server.

    Parameters
    ------------------------------------------------------
    data  - Should contain the bytes encoding the serialized data
          - I think this gets represented as a string
    count - how many input records are provided in the binary stream
    """
    try:
        # to avio.BinaryDecoder
        bdata = StringIO(data)
        decoder = avio.BinaryDecoder(bdata)
        for i in range(count):
            if (self.taskType == TaskType.MAP):
                inRecord = self.inReader.read(decoder)
                # Do we need to pass midCollector if its declared as an
                # instance variable
                self.map(inRecord, self.midCollector)
            elif (self.taskType == TaskType.REDUCE):
                # store the previous record
                prev = self.midRecord
                # read the new record
                self.midRecord = self.midReader.read(decoder)
                if (prev != None and not (keys_are_equal(
                        self.midRecord, prev, self._red_fkeys))):
                    # since the key has changed we need to finalize the
                    # processing for this group of key,value pairs
                    self.reduceFlush(prev, self.outCollector)
                self.reduce(self.midRecord, self.outCollector)
    except Exception as e:
        estr = traceback.format_exc()
        self.log.warning("failing: " + estr)
        self.fail(estr)

def __init__(self, reader, datum_reader):
    self._reader = reader
    self._raw_decoder = io.BinaryDecoder(reader)
    self._datum_decoder = None  # Maybe reset at every block.
    self._datum_reader = datum_reader

    # read the header: magic, meta, sync
    self._read_header()

    # ensure codec is valid
    self.codec = self.get_meta('avro.codec')
    if self.codec is None:
        self.codec = "null"
    if self.codec not in VALID_CODECS:
        raise DataFileException('Unknown codec: %s.' % self.codec)

    # get file length
    self._file_length = self.determine_file_length()

    # get ready to read
    self._block_count = 0
    self.datum_reader.writers_schema = schema.parse(self.get_meta(SCHEMA_KEY))

def read(self, fp, schema):
    sch = self.names.get_name('edu.berkeley.cs.local.' + schema, None)
    dreader = io.DatumReader(writers_schema=sch, readers_schema=sch)
    return dreader.read(io.BinaryDecoder(fp))

def decoder(p):
    bin_decoder = io.BinaryDecoder(p)
    return avro_reader.read(bin_decoder)

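# Usage sketch for decoder() above, assuming `io` is avro.io and `avro_reader`
# is a module-level avro.io.DatumReader; the schema and datum are illustrative.
from io import BytesIO
from avro import schema
from avro import io

SCHEMA = schema.Parse('{"type": "record", "name": "Msg",'
                      ' "fields": [{"name": "body", "type": "string"}]}')
avro_reader = io.DatumReader(SCHEMA)

buf = BytesIO()
io.DatumWriter(SCHEMA).write({'body': 'hello'}, io.BinaryEncoder(buf))
buf.seek(0)
print(decoder(buf))  # -> {'body': 'hello'}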