def __init__(
    self,
    writer,
    datum_writer,
    writer_schema=None,
    codec='null',
):
    """Constructs a new DataFileWriter instance.

    If the schema is not present, presume we're appending to an existing
    file: the sync marker, codec and schema are then read back from the
    file itself.

    Args:
      writer: File-like object to write into. Must accept bytes.
      datum_writer: DatumWriter used to encode each appended datum.
      writer_schema: Schema for a brand-new file; None means append.
      codec: Compression codec name; must be one of VALID_CODECS.
    """
    self._writer = writer
    self._encoder = avro_io.BinaryEncoder(writer)
    self._datum_writer = datum_writer
    # Datums are staged in this in-memory buffer and flushed one
    # container block at a time.
    self._buffer_writer = io.BytesIO()
    self._buffer_encoder = avro_io.BinaryEncoder(self._buffer_writer)
    self._block_count = 0
    self._meta = {}

    # Ensure we have a writer that accepts bytes:
    self._writer.write(b'')

    # Whether the header has already been written:
    self._header_written = False

    if writer_schema is not None:
        # Creating a new container file.
        if codec not in VALID_CODECS:
            raise DataFileException('Unknown codec: %r' % codec)
        self._sync_marker = DataFileWriter.GenerateSyncMarker()
        self.SetMeta('avro.codec', codec)
        self.SetMeta('avro.schema', str(writer_schema).encode('utf-8'))
        self.datum_writer.writer_schema = writer_schema
    else:
        # Appending: open writer for reading to collect metadata.
        dfr = DataFileReader(writer, avro_io.DatumReader())

        # TODO: collect arbitrary metadata
        # collect metadata
        self._sync_marker = dfr.sync_marker
        self.SetMeta('avro.codec', dfr.GetMeta('avro.codec'))

        # get schema used to write existing file
        schema_from_file = dfr.GetMeta('avro.schema').decode('utf-8')
        self.SetMeta('avro.schema', schema_from_file)
        self.datum_writer.writer_schema = schema.Parse(schema_from_file)

        # seek to the end of the file and prepare for writing
        writer.seek(0, 2)
        self._header_written = True
def encode_record_with_schema_id(self, schema_id, record):
    """Encode a record with a given schema id.

    Args:
      schema_id: Registry id of the writer schema.
      record: The record to encode; must be a python dictionary.

    Returns:
      bytes: magic byte + big-endian schema id + Avro-encoded record.

    Raises:
      SerializerError: if the record is not a dict, the schema does not
        exist, or the registry lookup fails.
    """
    if not isinstance(record, dict):
        raise SerializerError("record must be a dictionary")
    # use slow avro
    if schema_id not in self.id_to_writers:
        # Fetch the schema once and cache a DatumWriter for it.
        try:
            schema = self.registry_client.get_by_id(schema_id)
            if not schema:
                raise SerializerError("Schema does not exist")
            self.id_to_writers[schema_id] = io.DatumWriter(schema)
        except ClientError as e:
            # BUG FIX: chain the original registry error instead of
            # silently discarding it, so the root cause stays visible.
            raise SerializerError("Error fetching schema from registry") from e

    # get the writer
    writer = self.id_to_writers[schema_id]
    with ContextStringIO() as outf:
        # Header: magic byte, then the schema ID in network byte order
        # (big end).
        outf.write(struct.pack('b', MAGIC_BYTE))
        outf.write(struct.pack('>I', schema_id))
        # Avro-encode the record into the rest of the buffer.
        encoder = io.BinaryEncoder(outf)
        writer.write(record, encoder)
        return outf.getvalue()
def convert_output_to_avro_record(output):
    """Serialize *output* to Avro binary via write_output_to_encoder."""
    record_buffer = BytesIO()
    write_output_to_encoder(aio.BinaryEncoder(record_buffer), output)
    return record_buffer.getvalue()
def __init__(self, writer, datum_writer, writers_schema=None, codec='null'):
    """
    If the schema is not present, presume we're appending.

    @param writer: File-like object to write into.
    @param datum_writer: DatumWriter used to encode appended datums.
    @param writers_schema: Schema for a new file; None means append.
    @param codec: Compression codec name; must be in VALID_CODECS.
    """
    self._writer = writer
    self._encoder = io.BinaryEncoder(writer)
    self._datum_writer = datum_writer
    # Datums are staged here and flushed one container block at a time.
    self._buffer_writer = StringIO()
    self._buffer_encoder = io.BinaryEncoder(self._buffer_writer)
    self._block_count = 0
    self._meta = {}
    self._header_written = False

    if writers_schema is not None:
        # Writing a brand-new container file.
        if codec not in VALID_CODECS:
            raise DataFileException("Unknown codec: %r" % codec)
        self._sync_marker = DataFileWriter.generate_sync_marker()
        self.set_meta('avro.codec', codec)
        self.set_meta('avro.schema', str(writers_schema))
        self.datum_writer.writers_schema = writers_schema
    else:
        # open writer for reading to collect metadata
        dfr = DataFileReader(writer, io.DatumReader())

        # TODO(hammer): collect arbitrary metadata
        # collect metadata
        self._sync_marker = dfr.sync_marker
        self.set_meta('avro.codec', dfr.get_meta('avro.codec'))

        # get schema used to write existing file
        schema_from_file = dfr.get_meta('avro.schema')
        self.set_meta('avro.schema', schema_from_file)
        self.datum_writer.writers_schema = schema.parse(schema_from_file)

        # seek to the end of the file and prepare for writing
        writer.seek(0, 2)
        self._header_written = True
def serialize(self, items):
    """Avro-serialize *items* using the schema file on disk.

    Returns:
      The serialized record as raw bytes.
    """
    schema_path = "fb_scheam.avsc"
    # BUG FIX: the schema file handle was never closed; a context
    # manager guarantees closure even if parsing raises.
    with open(schema_path) as schema_file:
        SCHEMA = schema.Parse(schema_file.read())
    writer = io.DatumWriter(SCHEMA)
    bytes_writer = io2.BytesIO()
    encoder = io.BinaryEncoder(bytes_writer)
    # There must be a better way of writing this item that isn't so long
    writer.write(get_as_json(items), encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes
def encode(self, data, *, type_identifier: int = None, **kwargs):
    # pylint: disable=arguments-differ
    """
    Encode an object into Avro and return a :class:`bytes` object.

    :returns: a serialized message as a bytes object.
    """
    # Look up the writer schema by its registry id, then Avro-encode the
    # payload into an in-memory buffer.
    target_schema = self.registry.get_schema_by_id(type_identifier)
    out_buffer = io.BytesIO()
    writer = avro_io.DatumWriter(target_schema)
    writer.write(data, avro_io.BinaryEncoder(out_buffer))
    return out_buffer.getvalue()
def request(self, message_name, request_datum):
    """
    Writes a request message and reads a response or error message.
    """
    # Stage the handshake plus the call request in one in-memory buffer.
    out = StringIO()
    enc = io.BinaryEncoder(out)
    self.write_handshake_request(enc)
    self.write_call_request(message_name, request_datum, enc)

    # Transmit everything at once; block until the call response arrives.
    return self.issue_request(out.getvalue(), message_name, request_datum)
def serialize(items):
    """Avro-serialize *items* using the on-disk schema; return raw bytes."""
    from avro import schema, io
    import io as io2
    schema_path = "data/files/fb_scheam.avsc"
    # BUG FIX: the original rebound the imported ``schema`` module
    # (``schema = schema.Parse(...)``) and leaked the file handle.
    # Parse into a fresh name inside a context manager instead.
    with open(schema_path) as schema_file:
        parsed_schema = schema.Parse(schema_file.read())
    writer = io.DatumWriter(parsed_schema)
    bytes_writer = io2.BytesIO()
    encoder = io.BinaryEncoder(bytes_writer)
    # There must be a better way of writing this item that isn't so long
    print(get_as_json(items))
    writer.write(get_as_json(items), encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes
def dump_report(datum):
    """Avro-encode *datum* with REPORT_SCHEMA and return the encoded string.

    The uuid values are stripped before encoding (so avro doesn't choke)
    and restored on the datum afterwards.
    """
    # have to diddle with some of the values so avro doesn't choke
    # BUG FIX: the original used bare map() calls for their side
    # effects; map() is lazy under Python 3, so convert_readings would
    # never run. Use explicit loops / a list instead.
    uuids = [convert_uuids(p) for p in datum.itervalues()]
    for p in datum.itervalues():
        convert_readings(p)

    # then just dump it to a string
    out = StringIO()
    dwriter = io.DatumWriter(writers_schema=REPORT_SCHEMA)
    dwriter.write(datum, io.BinaryEncoder(out))

    # Restore the uuids that were stripped before encoding.
    for id, p in zip(uuids, datum.itervalues()):
        if id:
            p['uuid'] = id
    return out.getvalue()
def respond(self, call_request):
    """
    Called by a server to deserialize a request, compute and serialize
    a response or error. Compare to 'handle()' in Thrift.
    """
    # The request is consumed from one buffer; the serialized response
    # is accumulated in a second buffer and returned at the end.
    buffer_reader = StringIO(call_request)
    buffer_decoder = io.BinaryDecoder(buffer_reader)
    buffer_writer = StringIO()
    buffer_encoder = io.BinaryEncoder(buffer_writer)
    error = None
    response_metadata = {}
    try:
        remote_protocol = self.process_handshake(buffer_decoder, buffer_encoder)
        # handshake failure
        if remote_protocol is None:
            return buffer_writer.getvalue()

        # read request using remote protocol
        request_metadata = META_READER.read(buffer_decoder)
        remote_message_name = buffer_decoder.read_utf8()

        # get remote and local request schemas so we can do
        # schema resolution (one fine day)
        remote_message = remote_protocol.messages.get(remote_message_name)
        if remote_message is None:
            fail_msg = 'Unknown remote message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)
        local_message = self.local_protocol.messages.get(
            remote_message_name)
        if local_message is None:
            fail_msg = 'Unknown local message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)
        writers_schema = remote_message.request
        readers_schema = local_message.request
        request = self.read_request(writers_schema, readers_schema,
                                    buffer_decoder)

        # perform server logic
        try:
            response = self.invoke(local_message, request)
        except AvroRemoteException, e:
            # Application-declared errors are serialized back to the caller.
            error = e
        except Exception, e:
            # Wrap unexpected failures so they can still be Avro-encoded.
            error = AvroRemoteException(str(e))
        # NOTE(review): this definition continues beyond the visible
        # chunk of the file; the outer try is closed further down.
def Request(self, message_name, request_datum):
    """Writes a request message and reads a response or error message.

    Args:
      message_name: Name of the IPC method.
      request_datum: IPC request.
    Returns:
      The IPC response.
    """
    # Stage the handshake and the call request in a single buffer.
    request_buffer = io.BytesIO()
    request_encoder = avro_io.BinaryEncoder(request_buffer)
    self._WriteHandshakeRequest(request_encoder)
    self._WriteCallRequest(message_name, request_datum, request_encoder)

    # Transmit in one shot; block until the call response arrives.
    return self._IssueRequest(
        request_buffer.getvalue(), message_name, request_datum)
def dump():
    """Round-trip the bytes_json fixture through Avro, timing encode/decode.

    Writes the raw Avro bytes, a zlib-compressed copy, and a base64 copy
    of the compressed bytes to files as a side effect. Uses the
    module-level ``writer`` and ``reader`` Avro datum writer/reader.
    """
    # BUG FIX: every open/close pair is now a ``with`` block so handles
    # are closed even if an intermediate step raises. The local that
    # shadowed the builtin ``bytes`` is renamed to ``raw_bytes``.
    with open('bytes_json', 'rb') as f:
        data = json.load(f)

    # Stringify numeric leaves so the writer schema accepts them.
    data['RecordId']['Timestamp'] = str(data['RecordId']['Timestamp'])
    for timestamp in data['IMU']:
        for orientationType in data['IMU'][timestamp]:
            for dimension in data['IMU'][timestamp][orientationType]:
                data['IMU'][timestamp][orientationType][dimension] = str(
                    data['IMU'][timestamp][orientationType][dimension])

    bytes_writer = io.BytesIO()
    encoder = avroIo.BinaryEncoder(bytes_writer)
    start_time = time.time()
    writer.write(data, encoder)
    print("encoding time for avro in seconds: %s" % (time.time() - start_time))
    raw_bytes = bytes_writer.getvalue()
    print(len(raw_bytes))
    print(type(raw_bytes))

    with open('avro_output', 'wb') as f:
        f.write(raw_bytes)

    compressed_data = zlib.compress(raw_bytes)  # bytes
    with open('compressed_bytes_avro', 'wb') as f_out_gzip:
        f_out_gzip.write(compressed_data)

    encoded = base64.b64encode(compressed_data)
    with open('compressed_bytes_base64_fromAvro', 'wb') as f:
        f.write(encoded)

    # decoding
    bytes_reader = io.BytesIO(raw_bytes)
    decoder = avroIo.BinaryDecoder(bytes_reader)
    start_time = time.time()
    original_data = reader.read(decoder)
    print("decoding time for avro in seconds: %s" % (time.time() - start_time))
def request(self, message_name, request_datum):
    """
    Writes a request message and reads a response or error message.
    """
    # build handshake and call request
    out = StringIO()
    enc = io.BinaryEncoder(out)
    self.write_handshake_request(enc)
    self.write_call_request(message_name, request_datum, enc)

    # send the handshake and call request; block until call response
    call_response = self.transceiver.transceive(out.getvalue())

    # process the handshake and call response
    dec = io.BinaryDecoder(StringIO(call_response))
    if self.read_handshake_response(dec):
        return self.read_call_response(message_name, dec)
    # Handshake was not accepted: re-issue the whole request.
    return self.request(message_name, request_datum)
def encode_record(schema_id, schema, record):
    """Frame *record* as magic byte + big-endian schema id + Avro body."""
    # construct avro writer
    datum_writer = io.DatumWriter(schema)
    buf = StringIO.StringIO()

    # Header: the magic byte followed by the schema ID in network byte
    # order (big end).
    buf.write(struct.pack('b', MAGIC_BYTE))
    buf.write(struct.pack('>I', schema_id))

    # Avro-encode the record into the remainder of the buffer.
    datum_writer.write(record, io.BinaryEncoder(buf))
    return buf.getvalue()
def send_event(exchange):
    """Send an event to publish at an input "exchange"."""
    # Get Avro schema, create serialized raw_bytes version of event body.
    # BUG FIX: the schema file handle was never closed; use a context
    # manager so it is released even if parsing fails.
    with open(f"schemas/{exchange}.avsc", "rb") as schema_file:
        event_schema = schema.Parse(schema_file.read())
    writer = avro_io.DatumWriter(event_schema)
    bytes_writer = io.BytesIO()
    encoder = avro_io.BinaryEncoder(bytes_writer)
    writer.write(event_bodies[exchange], encoder)
    raw_bytes = bytes_writer.getvalue()

    # create connection, declare exchange
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host='localhost'))
    channel = connection.channel()
    channel.exchange_declare(exchange=exchange, exchange_type='fanout')

    # publish message, close connection
    channel.basic_publish(exchange=exchange, routing_key='', body=raw_bytes)
    connection.close()
def __init__(self, scheme=None, outputClient=None):
    """
    Parameters
    ---------------------------------------------
    scheme - The scheme for the datums to output - can be a json string
           - or an instance of Schema
    outputClient - The output client used to send messages to the parent
    """
    # BUG FIX: validate the required client before parsing the scheme,
    # so a missing client is reported as ValueError rather than being
    # masked by a schema-parse failure (e.g. when scheme is also None).
    if (outputClient is None):
        raise ValueError("output client can't be none.")

    # Accept either a ready Schema instance or a JSON schema string.
    if not (isinstance(scheme, schema.Schema)):
        scheme = schema.parse(scheme)

    self.scheme = scheme
    # Datums are binary-encoded into this buffer before being handed
    # to the output client.
    self.buff = StringIO()
    self.encoder = avio.BinaryEncoder(self.buff)
    self.datum_writer = avio.DatumWriter(writers_schema=self.scheme)
    self.outputClient = outputClient
def respond(self, call_request):
    """Entry point to process one procedure call.

    Args:
      call_request: Serialized procedure call request.
        The call request includes:
        - handshake prefix;
        - call request metadata (map: string -> bytes);
        - protocol message name;
        - encoded request for the protocol message.
    Returns:
      Serialized procedure call response.
    Raises:
      ???
    """
    # Request is consumed from buffer_reader; the serialized response is
    # accumulated in buffer_writer and returned at the end.
    buffer_reader = io.BytesIO(call_request)
    buffer_decoder = avro_io.BinaryDecoder(buffer_reader)
    buffer_writer = io.BytesIO()
    buffer_encoder = avro_io.BinaryEncoder(buffer_writer)
    error = None
    # Map: string -> bytes
    response_metadata = {}

    try:
        remote_protocol = self._process_handshake(buffer_decoder,
                                                  buffer_encoder)
        # handshake failure
        if remote_protocol is None:
            return buffer_writer.getvalue()

        # read request using remote protocol
        request_metadata = META_READER.read(buffer_decoder)
        remote_message_name = buffer_decoder.read_utf8()

        # get remote and local request schemas so we can do
        # schema resolution (one fine day)
        remote_message = remote_protocol.message_map.get(
            remote_message_name)
        if remote_message is None:
            fail_msg = "Unknown remote message: %s" % remote_message_name
            raise schema.AvroException(fail_msg)
        local_message = self.local_protocol.message_map.get(
            remote_message_name)
        if local_message is None:
            fail_msg = "Unknown local message: %s" % remote_message_name
            raise schema.AvroException(fail_msg)
        writer_schema = remote_message.request
        reader_schema = local_message.request
        request = self._read_request(writer_schema, reader_schema,
                                     buffer_decoder)
        logging.debug("Processing request: %r", request)

        # perform server logic
        try:
            response = self.invoke(local_message, request)
        except AvroRemoteException as exn:
            # Application-declared errors: serialized back to the caller.
            error = exn
        except Exception as exn:
            # Wrap unexpected failures so they can still be Avro-encoded.
            error = AvroRemoteException(str(exn))

        # write response using local protocol
        META_WRITER.write(response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(error is not None)
        if error is None:
            writer_schema = local_message.response
            self._write_response(writer_schema,
                                 response, buffer_encoder)
        else:
            writer_schema = local_message.errors
            self._write_error(writer_schema, error, buffer_encoder)
    except schema.AvroException as exn:
        logging.error("Error while encoding response:\n%s",
                      traceback.format_exc())
        # FIXME: We may already have written the error flag
        error = AvroRemoteException(str(exn))
        # NOTE(review): this encoder writes into a throwaway BytesIO, so
        # the system error below never reaches buffer_writer — confirm
        # whether this is intentional.
        buffer_encoder = avro_io.BinaryEncoder(io.BytesIO())
        META_WRITER.write(response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(True)
        self._write_error(SYSTEM_ERROR_SCHEMA, error, buffer_encoder)
    return buffer_writer.getvalue()
def test1(self):
    """End-to-end TaskRunner test against a mock tether parent (Python 2).

    Starts the mock parent in a subprocess, runs a WordCountTask through
    a TaskRunner, and drives the map and reduce phases over HTTP.
    """
    from word_count_task import WordCountTask
    from avro.tether import TaskRunner, find_port, HTTPRequestor, inputProtocol, TaskType
    from avro import io as avio
    import mock_tether_parent
    import subprocess
    import StringIO
    import logging

    # set the logging level to debug so that debug messages are printed
    logging.basicConfig(level=logging.DEBUG)

    proc = None
    try:
        # launch the server in a separate process
        env = dict()
        env["PYTHONPATH"] = ':'.join(sys.path)
        parent_port = find_port()

        pyfile = mock_tether_parent.__file__
        proc = subprocess.Popen(
            ["python", pyfile, "start_server", "{0}".format(parent_port)])
        input_port = find_port()

        print "Mock server started process pid={0}".format(proc.pid)
        # Possible race condition? open tries to connect to the subprocess before the subprocess is fully started
        # so we give the subprocess time to start up
        time.sleep(1)

        runner = TaskRunner(WordCountTask())

        runner.start(outputport=parent_port, join=False)

        # Test sending various messages to the server and ensuring they are
        # processed correctly
        requestor = HTTPRequestor(
            "localhost", runner.server.server_address[1], inputProtocol)

        # TODO: We should validate that open worked by grabbing the STDOUT of the subproces
        # and ensuring that it outputted the correct message.

        # Test the mapper
        requestor.request(
            "configure", {
                "taskType": TaskType.MAP,
                "inSchema": str(runner.task.inschema),
                "outSchema": str(runner.task.midschema)
            })

        # Serialize some data so we can send it to the input function
        datum = "This is a line of text"
        writer = StringIO.StringIO()
        encoder = avio.BinaryEncoder(writer)
        datum_writer = avio.DatumWriter(runner.task.inschema)
        datum_writer.write(datum, encoder)

        # Rewind the buffer so the encoded bytes can be read back out.
        writer.seek(0)
        data = writer.read()

        # Call input to simulate calling map
        requestor.request("input", {"data": data, "count": 1})

        #Test the reducer
        requestor.request(
            "configure", {
                "taskType": TaskType.REDUCE,
                "inSchema": str(runner.task.midschema),
                "outSchema": str(runner.task.outschema)
            })

        #Serialize some data so we can send it to the input function
        datum = {"key": "word", "value": 2}
        writer = StringIO.StringIO()
        encoder = avio.BinaryEncoder(writer)
        datum_writer = avio.DatumWriter(runner.task.midschema)
        datum_writer.write(datum, encoder)

        writer.seek(0)
        data = writer.read()

        #Call input to simulate calling reduce
        requestor.request("input", {"data": data, "count": 1})

        requestor.request("complete", {})

        runner.task.ready_for_shutdown.wait()
        runner.server.shutdown()
        #time.sleep(2)
        #runner.server.shutdown()

        sthread = runner.sthread

        #Possible race condition?
        time.sleep(1)

        #make sure the other thread terminated
        self.assertFalse(sthread.isAlive())

        #shutdown the logging
        logging.shutdown()

    except Exception as e:
        raise
    finally:
        #close the process
        if not (proc is None):
            proc.kill()
def get_avro_binary(name, email):
    """Return the Avro binary encoding of a {name, email} record."""
    out = StringIO.StringIO()
    record_writer = io.DatumWriter(schema_obj)
    record_writer.write({'name': name, 'email': email}, io.BinaryEncoder(out))
    return out.getvalue()
def write(self, fp, datum, schema):
    """Binary-encode *datum* into *fp* using the named local schema."""
    resolved = self.names.get_name('edu.berkeley.cs.local.' + schema, None)
    writer = io.DatumWriter(writers_schema=resolved)
    writer.write(datum, io.BinaryEncoder(fp))
def write_datum(datum, writers_schema):
    """Encode *datum* with *writers_schema*.

    Returns the (buffer, encoder, datum_writer) triple used, so callers
    can inspect or reuse them.
    """
    out = StringIO()
    binary_encoder = io.BinaryEncoder(out)
    dwriter = io.DatumWriter(writers_schema)
    dwriter.write(datum, binary_encoder)
    return out, binary_encoder, dwriter
        # NOTE(review): continuation of respond() from an earlier chunk;
        # the enclosing def, try:, and the first except clause header are
        # outside this view.
            error = e
        except Exception, e:
            # Wrap unexpected failures so they can still be Avro-encoded.
            error = AvroRemoteException(str(e))

        # write response using local protocol
        META_WRITER.write(response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(error is not None)
        if error is None:
            writers_schema = local_message.response
            self.write_response(writers_schema, response, buffer_encoder)
        else:
            writers_schema = local_message.errors
            self.write_error(writers_schema, error, buffer_encoder)
    except schema.AvroException, e:
        error = AvroRemoteException(str(e))
        # NOTE(review): this encoder targets a throwaway StringIO, so the
        # system error written below never reaches buffer_writer — confirm
        # whether this is intentional.
        buffer_encoder = io.BinaryEncoder(StringIO())
        META_WRITER.write(response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(True)
        self.write_error(SYSTEM_ERROR_SCHEMA, error, buffer_encoder)
    return buffer_writer.getvalue()

def process_handshake(self, decoder, encoder):
    # Read the client's handshake and resolve which protocol it speaks.
    handshake_request = HANDSHAKE_RESPONDER_READER.read(decoder)
    handshake_response = {}

    # determine the remote protocol
    client_hash = handshake_request.get('clientHash')
    client_protocol = handshake_request.get('clientProtocol')
    remote_protocol = self.get_protocol_cache(client_hash)

    if remote_protocol is None and client_protocol is not None:
        remote_protocol = protocol.parse(client_protocol)
        # NOTE(review): this definition continues past the visible chunk.
def write_datum(datum, writer_schema):
    """Encode *datum* with *writer_schema*.

    Returns the (buffer, encoder, datum_writer) triple used, so callers
    can inspect or reuse them.
    """
    out = io.BytesIO()
    binary_encoder = avro_io.BinaryEncoder(out)
    dwriter = avro_io.DatumWriter(writer_schema)
    dwriter.write(datum, binary_encoder)
    return out, binary_encoder, dwriter
def Respond(self, call_request):
    """Entry point to process one procedure call.

    Args:
      call_request: Serialized procedure call request.
    Returns:
      Serialized procedure call response.
    Raises:
      ???
    """
    # Request is consumed from buffer_reader; the serialized response is
    # accumulated in buffer_writer and returned at the end.
    buffer_reader = io.BytesIO(call_request)
    buffer_decoder = avro_io.BinaryDecoder(buffer_reader)
    buffer_writer = io.BytesIO()
    buffer_encoder = avro_io.BinaryEncoder(buffer_writer)
    error = None
    response_metadata = {}

    try:
        remote_protocol = self._ProcessHandshake(buffer_decoder,
                                                 buffer_encoder)
        # handshake failure
        if remote_protocol is None:
            return buffer_writer.getvalue()

        # read request using remote protocol
        request_metadata = META_READER.read(buffer_decoder)
        remote_message_name = buffer_decoder.read_utf8()

        # get remote and local request schemas so we can do
        # schema resolution (one fine day)
        remote_message = remote_protocol.message_map.get(
            remote_message_name)
        if remote_message is None:
            fail_msg = 'Unknown remote message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)
        local_message = self.local_protocol.message_map.get(
            remote_message_name)
        if local_message is None:
            fail_msg = 'Unknown local message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)
        writer_schema = remote_message.request
        reader_schema = local_message.request
        request = self._ReadRequest(writer_schema, reader_schema,
                                    buffer_decoder)
        logger.info('Processing request: %r', request)

        # perform server logic
        try:
            response = self.Invoke(local_message, request)
        except AvroRemoteException as exn:
            # Application-declared errors: serialized back to the caller.
            error = exn
        except Exception as exn:
            # Wrap unexpected failures so they can still be Avro-encoded.
            error = AvroRemoteException(str(exn))

        # write response using local protocol
        META_WRITER.write(response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(error is not None)
        if error is None:
            writer_schema = local_message.response
            self._WriteResponse(writer_schema, response, buffer_encoder)
        else:
            writer_schema = local_message.errors
            self._WriteError(writer_schema, error, buffer_encoder)
    except schema.AvroException as exn:
        error = AvroRemoteException(str(exn))
        # BUG FIX: BinaryEncoder emits bytes; the original wrapped an
        # io.StringIO here, which raises TypeError on the first write
        # under Python 3. Use io.BytesIO, matching the handshake path.
        # NOTE: this encoder still targets a throwaway buffer, so the
        # system error is not part of the returned payload.
        buffer_encoder = avro_io.BinaryEncoder(io.BytesIO())
        META_WRITER.write(response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(True)
        self._WriteError(SYSTEM_ERROR_SCHEMA, error, buffer_encoder)
    return buffer_writer.getvalue()
def test1(self):
    """
    Test that the thether_task is working. We run the mock_tether_parent
    in a separate subprocess
    """
    from avro import tether
    from avro import io as avio
    from avro import schema
    from avro.tether import HTTPRequestor, inputProtocol, find_port
    import StringIO
    import mock_tether_parent
    from word_count_task import WordCountTask

    task = WordCountTask()

    proc = None
    try:
        # launch the server in a separate process
        # env["AVRO_TETHER_OUTPUT_PORT"]=output_port
        env = dict()
        env["PYTHONPATH"] = ':'.join(sys.path)
        server_port = find_port()

        pyfile = mock_tether_parent.__file__
        proc = subprocess.Popen(
            ["python", pyfile, "start_server", "{0}".format(server_port)])
        input_port = find_port()

        print "Mock server started process pid={0}".format(proc.pid)
        # Possible race condition? open tries to connect to the subprocess before the subprocess is fully started
        # so we give the subprocess time to start up
        time.sleep(1)
        task.open(input_port, clientPort=server_port)

        # TODO: We should validate that open worked by grabbing the STDOUT of the subproces
        # and ensuring that it outputted the correct message.

        #***************************************************************
        # Test the mapper
        task.configure(tether.TaskType.MAP, str(task.inschema),
                       str(task.midschema))

        # Serialize some data so we can send it to the input function
        datum = "This is a line of text"
        writer = StringIO.StringIO()
        encoder = avio.BinaryEncoder(writer)
        datum_writer = avio.DatumWriter(task.inschema)
        datum_writer.write(datum, encoder)

        # Rewind the buffer so the encoded bytes can be read back out.
        writer.seek(0)
        data = writer.read()

        # Call input to simulate calling map
        task.input(data, 1)

        # Test the reducer
        task.configure(tether.TaskType.REDUCE, str(task.midschema),
                       str(task.outschema))

        # Serialize some data so we can send it to the input function
        datum = {"key": "word", "value": 2}
        writer = StringIO.StringIO()
        encoder = avio.BinaryEncoder(writer)
        datum_writer = avio.DatumWriter(task.midschema)
        datum_writer.write(datum, encoder)

        writer.seek(0)
        data = writer.read()

        # Call input to simulate calling reduce
        task.input(data, 1)

        task.complete()

        # try a status
        task.status("Status message")

    except Exception as e:
        raise
    finally:
        # close the process
        if not (proc is None):
            proc.kill()
        pass