예제 #1
0
파일: datafile.py 프로젝트: rajeshmr/kiji
 def _read_block_header(self):
     """Read the next block's header and choose the decoder for its data.

     The record count of the block is read first and stored on the reader.
     What happens next depends on ``self.codec``:

     * ``null``: the block length is skipped and data is decoded directly
       from the raw decoder.
     * ``deflate`` / ``snappy``: the block is decompressed and wrapped in
       a fresh in-memory binary decoder.

     Raises:
       DataFileException: if ``self.codec`` is none of the codecs above.
     """
     self._block_count = self.raw_decoder.read_long()

     if self.codec == "null":
         # The block's byte length is not needed; step over it.
         self.raw_decoder.skip_long()
         self._datum_decoder = self._raw_decoder
         return

     if self.codec == 'deflate':
         # The payload is stored as (length, data), the same layout used
         # to encode the "bytes" type.
         compressed = self.raw_decoder.read_bytes()
         # -15 is the log of the window size; the negative sign selects
         # "raw" decompression without zlib headers (see zlib.h).
         inflated = zlib.decompress(compressed, -15)
         self._datum_decoder = avro_io.BinaryDecoder(io.BytesIO(inflated))
         return

     if self.codec == 'snappy':
         # The stored length includes a 4-byte CRC32 checksum that is not
         # part of the compressed data itself.
         stored_length = self.raw_decoder.read_long()
         compressed = self.raw_decoder.read(stored_length - 4)
         inflated = snappy.decompress(compressed)
         self._datum_decoder = avro_io.BinaryDecoder(io.BytesIO(inflated))
         self.raw_decoder.check_crc32(inflated)
         return

     raise DataFileException("Unknown codec: %r" % self.codec)
예제 #2
0
 def _read_block_header(self):
     """Read the next block's header and choose the decoder for its data.

     Stores the block's record count in ``self.block_count``. For the
     ``null`` codec the raw decoder is reused; for ``deflate``, ``snappy``
     and ``zstandard`` the block is decompressed into memory and a new
     binary decoder is created over the result.

     Raises:
       DataFileException: if ``self.codec`` is not one of those codecs.
     """
     self.block_count = self.raw_decoder.read_long()

     if self.codec == "null":
         # The block's byte length is not needed; step over it.
         self.raw_decoder.skip_long()
         self._datum_decoder = self._raw_decoder
         return

     if self.codec == 'deflate':
         # The payload is stored as (length, data), the same layout used
         # to encode the "bytes" type.
         compressed = self.raw_decoder.read_bytes()
         # -15 is the log of the window size; the negative sign selects
         # "raw" decompression without zlib headers (see zlib.h).
         inflated = zlib.decompress(compressed, -15)
         self._datum_decoder = io.BinaryDecoder(StringIO(inflated))
         return

     if self.codec == 'snappy':
         # The stored length includes a 4-byte CRC32 checksum that is not
         # part of the compressed data itself.
         stored_length = self.raw_decoder.read_long()
         compressed = self.raw_decoder.read(stored_length - 4)
         inflated = snappy.decompress(compressed)
         self._datum_decoder = io.BinaryDecoder(StringIO(inflated))
         self.raw_decoder.check_crc32(inflated)
         return

     if self.codec == 'zstandard':
         stored_length = self.raw_decoder.read_long()
         compressed = self.raw_decoder.read(stored_length)
         inflated = bytearray()
         decompressor = zstd.ZstdDecompressor()
         # Drain the stream in 16 KiB pieces until it reports no more data.
         with decompressor.stream_reader(StringIO(compressed)) as stream:
             piece = stream.read(16384)
             while piece:
                 inflated.extend(piece)
                 piece = stream.read(16384)
         self._datum_decoder = io.BinaryDecoder(StringIO(inflated))
         return

     raise DataFileException("Unknown codec: %r" % self.codec)
예제 #3
0
def check_skip_number(number_type):
    """Count the encodings for which skipping a number works.

    For every entry of ``BINARY_ENCODINGS`` the entry's value is written,
    followed by a known marker value. The first value is then skipped with
    ``skip_long`` and the next datum is read back.

    Args:
      number_type: name of the Avro number type; lower-cased to build the
        writer schema.
    Returns:
      How many times the datum read after the skip equalled the marker.
    """
    logging.debug('Testing skip number for %s', number_type)
    marker = 6253
    matches = 0
    for skipped_value, _hex_encoding in BINARY_ENCODINGS:
        logging.debug('Value to Skip: %d', skipped_value)

        # Write the value that will be skipped, then the marker.
        writer_schema = schema.parse('"%s"' % number_type.lower())
        buf, encoder, datum_writer = write_datum(skipped_value, writer_schema)
        datum_writer.write(marker, encoder)

        # Step over the first value.
        stream = io.BytesIO(buf.getvalue())
        decoder = avro_io.BinaryDecoder(stream)
        decoder.skip_long()

        # Whatever comes next should be the marker.
        read_value = avro_io.DatumReader(writer_schema).read(decoder)

        logging.debug('Read Value: %d', read_value)
        if read_value == marker:
            matches += 1
    return matches
예제 #4
0
  def read_meta_data_from_file(f):
    """Reads metadata from a given Avro file.

    Args:
      f: Avro file to read. If its position is past the start, it is rewound
        to offset 0 before the header is decoded.
    Returns:
      a tuple containing the codec, schema, and the sync marker of the Avro
      file. The codec is 'null' when the file metadata does not name one.

    Raises:
      ValueError: if the file does not start with the byte sequence defined in
                  the specification.
    """
    if f.tell() > 0:
      f.seek(0)
    decoder = avroio.BinaryDecoder(f)
    header = avroio.DatumReader().read_data(datafile.META_SCHEMA,
                                            datafile.META_SCHEMA, decoder)
    if header.get('magic') != datafile.MAGIC:
      # Interpolate here: unlike logging calls, exception constructors do not
      # apply %-formatting to extra positional arguments.
      raise ValueError(
          'Not an Avro file. File header should start with %s but '
          'started with %s instead.' % (datafile.MAGIC, header.get('magic')))

    meta = header['meta']

    if datafile.CODEC_KEY in meta:
      codec = meta[datafile.CODEC_KEY]
    else:
      codec = 'null'

    schema_string = meta[datafile.SCHEMA_KEY]
    sync_marker = header['sync']

    return codec, schema_string, sync_marker
예제 #5
0
    def read_block_from_file(f, codec, schema, expected_sync_marker):
        """Reads a block from a given Avro file.

        Args:
          f: Avro file to read, positioned at the start of a block.
          codec: codec name, passed through unchanged to the returned block.
          schema: schema, passed through unchanged to the returned block.
          expected_sync_marker: sync marker that must follow the block data.
        Returns:
          A single _AvroBlock built from the block bytes, the record count,
          the codec, the schema, the starting offset and the number of bytes
          consumed from ``f``.

        Raises:
          ValueError: If the block cannot be read properly because the file
            doesn't match the specification.
        """
        offset = f.tell()
        decoder = avroio.BinaryDecoder(f)
        num_records = decoder.read_long()
        block_size = decoder.read_long()
        block_bytes = decoder.read(block_size)
        sync_marker = decoder.read(len(expected_sync_marker))
        if sync_marker != expected_sync_marker:
            # Interpolate here: exception constructors do not apply
            # %-formatting to extra positional arguments.
            raise ValueError(
                'Unexpected sync marker (actual "%s" vs expected "%s"). '
                'Maybe the underlying avro file is corrupted?' %
                (sync_marker, expected_sync_marker))
        size = f.tell() - offset
        return _AvroBlock(block_bytes, num_records, codec, schema, offset,
                          size)
예제 #6
0
파일: test_io.py 프로젝트: zhilinwang/avro
def check_skip_number(number_type):
  """Count the encodings for which skipping a number works.

  For every entry of BINARY_ENCODINGS the entry's value is written,
  followed by a known marker value. The first value is skipped with
  skip_long and the next datum is read back and compared to the marker.

  Progress is printed; each print below passes a single parenthesised
  expression, which prints the same text as the bare print statement.

  Args:
    number_type: name of the Avro number type under test.
  Returns:
    How many times the datum read after the skip equalled the marker.
  """
  print_test_name('TEST SKIP %s' % number_type.upper())
  marker = 6253
  matches = 0
  for skipped_value, _hex_encoding in BINARY_ENCODINGS:
    print('Value to Skip: %d' % skipped_value)

    # Write the value that will be skipped, then the marker.
    writers_schema = schema.parse('"%s"' % number_type.lower())
    buf, encoder, datum_writer = write_datum(skipped_value, writers_schema)
    datum_writer.write(marker, encoder)

    # Step over the first value.
    stream = StringIO(buf.getvalue())
    decoder = io.BinaryDecoder(stream)
    decoder.skip_long()

    # Whatever comes next should be the marker.
    read_value = io.DatumReader(writers_schema).read(decoder)

    print('Read Value: %d' % read_value)
    if read_value == marker:
      matches += 1
    print('')
  return matches
예제 #7
0
    def read_block_from_file(f, codec, schema, expected_sync_marker):
        """Reads a block from a given Avro file.

        Args:
          f: Avro file to read.
          codec: The codec to use for block-level decompression.
            Supported codecs: 'null', 'deflate', 'snappy'
          schema: Avro Schema definition represented as JSON string.
          expected_sync_marker: Avro synchronization marker. If the block's
            sync marker does not match with this parameter then ValueError
            is thrown.
        Returns:
          A single _AvroBlock.

        Raises:
          ValueError: If the block cannot be read properly because the file
            doesn't match the specification.
        """
        offset = f.tell()
        decoder = avroio.BinaryDecoder(f)
        num_records = decoder.read_long()
        block_size = decoder.read_long()
        block_bytes = decoder.read(block_size)
        sync_marker = decoder.read(len(expected_sync_marker))
        if sync_marker != expected_sync_marker:
            # Interpolate here: exception constructors do not apply
            # %-formatting to extra positional arguments.
            raise ValueError(
                'Unexpected sync marker (actual "%s" vs expected "%s"). '
                'Maybe the underlying avro file is corrupted?' %
                (sync_marker, expected_sync_marker))
        # Number of bytes consumed from the file for this block.
        size = f.tell() - offset
        return _AvroBlock(block_bytes, num_records, codec, schema, offset,
                          size)
예제 #8
0
    def _decompress_bytes(data, codec):
        """Return the decompressed form of one block's bytes.

        Args:
          data: the block bytes as stored in the file.
          codec: 'null', 'deflate' or 'snappy'.
        Returns:
          ``data`` itself for 'null', otherwise the decompressed bytes.

        Raises:
          ValueError: if ``codec`` is not recognised, or if it is 'snappy'
            and the snappy module cannot be imported.
        """
        if codec == 'null':
            return data
        elif codec == 'deflate':
            # zlib.MAX_WBITS is the window size. '-' sign indicates that this is
            # raw data (without headers). See zlib and Avro documentations for more
            # details.
            return zlib.decompress(data, -zlib.MAX_WBITS)
        elif codec == 'snappy':
            # Snappy is an optional avro codec.
            # See Snappy and Avro documentation for more details.
            try:
                import snappy
            except ImportError:
                raise ValueError('Snappy does not seem to be installed.')

            # Compressed data includes a 4-byte CRC32 checksum which we verify.
            # We take care to avoid extra copies of data while slicing large objects
            # by use of a buffer.
            result = snappy.decompress(buffer(data)[:-4])
            avroio.BinaryDecoder(cStringIO.StringIO(
                data[-4:])).check_crc32(result)
            return result
        else:
            # Interpolate here: exception constructors do not apply
            # %-formatting to extra positional arguments.
            raise ValueError('Unknown codec: %r' % (codec,))
예제 #9
0
    def __init__(self, reader, datum_reader):
        """Set up a data file reader and consume the file header.

        Args:
          reader: Open file to read from.
          datum_reader: Avro datum reader; its writer schema is set from
            the schema stored in the file metadata.

        Raises:
          DataFileException: if the codec named in the file metadata is
            not in VALID_CODECS.
        """
        self._reader = reader
        self._raw_decoder = avro_io.BinaryDecoder(reader)
        # May be replaced each time a new block is started.
        self._datum_decoder = None
        self._datum_reader = datum_reader

        # Header layout: magic, meta, sync.
        self._read_header()

        # A missing codec entry means the "null" codec.
        raw_codec = self.GetMeta('avro.codec')
        self.codec = "null" if raw_codec is None else raw_codec.decode('utf-8')
        if self.codec not in VALID_CODECS:
            raise DataFileException('Unknown codec: %s.' % self.codec)

        self._file_length = self._GetInputFileLength()

        # No block has been read yet.
        self._block_count = 0
        schema_json = self.GetMeta(SCHEMA_KEY).decode('utf-8')
        self.datum_reader.writer_schema = schema.Parse(schema_json)
예제 #10
0
def deserialize(x):
    """Decode one Avro binary datum using the schema file on disk.

    Args:
      x: bytes of a single Avro-encoded message.
    Returns:
      The value the datum reader produces for ``x``.
    """
    schema_path = "data/files/fb_scheam.avsc"
    # Close the schema file deterministically instead of leaving the handle
    # to the garbage collector.
    with open(schema_path) as schema_file:
        schema1 = schema.Parse(schema_file.read())
    bytes_reader = io2.BytesIO(x)
    decoder = io.BinaryDecoder(bytes_reader)
    reader = io.DatumReader(schema1)
    return reader.read(decoder)
예제 #11
0
 def _process_handshake(self, call_response, message_name, request_datum):
     """Decode a call response, re-issuing the request when needed.

     The handshake part of ``call_response`` is read first. If it reports
     that a call response follows, that response is read and returned;
     otherwise the request is sent again through ``self.request``.
     """
     decoder = io.BinaryDecoder(StringIO(call_response))
     if not self.read_handshake_response(decoder):
         return self.request(message_name, request_datum)
     return self.read_call_response(message_name, decoder)
예제 #12
0
def load_report(data):
    """Decode a report, post-process it, print it and return it.

    ``data`` is decoded with REPORT_SCHEMA as both writer and reader
    schema. ``stringify_uuids`` is then applied to every value of the
    decoded report and ``remove_none`` to the report itself.
    """
    stream = StringIO(data)
    dreader = io.DatumReader(writers_schema=REPORT_SCHEMA,
                             readers_schema=REPORT_SCHEMA)
    report = dreader.read(io.BinaryDecoder(stream))
    for value in report.itervalues():
        stringify_uuids(value)
    remove_none(report)
    print(report)
    return report
예제 #13
0
    def issue_request(self, call_request, message_name, request_datum):
        """Send ``call_request`` and return the decoded call response.

        The request is sent through ``self.transceiver``. If the handshake
        part of the reply reports that no call response follows, the
        request is issued again through ``self.request``.
        """
        raw_response = self.transceiver.transceive(call_request)

        # The reply starts with the handshake, then the call response.
        decoder = io.BinaryDecoder(StringIO(raw_response))
        if not self.read_handshake_response(decoder):
            return self.request(message_name, request_datum)
        return self.read_call_response(message_name, decoder)
예제 #14
0
    def _IssueRequest(self, call_request, message_name, request_datum):
        """Send ``call_request`` and return the decoded call response.

        The request is sent through ``self.transceiver``. If the handshake
        part of the reply reports that no call response follows, the
        request is issued again through ``self.Request``.
        """
        raw_response = self.transceiver.Transceive(call_request)

        # The reply starts with the handshake, then the call response.
        decoder = avro_io.BinaryDecoder(io.BytesIO(raw_response))
        if not self._ReadHandshakeResponse(decoder):
            return self.Request(message_name, request_datum)
        return self._ReadCallResponse(message_name, decoder)
예제 #15
0
def consumer2():
    """Return the first message from the 'test' topic, Avro-decoded.

    Returns:
      The decoded value of the first message the consumer yields, or None
      if the consumer's iteration ends without yielding a message.
    """
    consumer = KafkaConsumer('test')
    schema_path = "data/files/fb_scheam.avsc"
    # Close the schema file deterministically instead of leaking the handle.
    with open(schema_path) as schema_file:
        schema1 = schema.Parse(schema_file.read())
    # The reader depends only on the schema, so build it once.
    reader = io.DatumReader(schema1)
    for msg in consumer:
        decoder = io.BinaryDecoder(io2.BytesIO(msg.value))
        # Only the first message is ever decoded; callers get it directly.
        return reader.read(decoder)
예제 #16
0
  def records(self):
    """Yield the records of this block one by one.

    A decoder is created over the decompressed block bytes and exactly
    ``self._num_records`` datums are read from it, using ``self._schema``
    as both writer and reader schema.
    """
    block_stream = cStringIO.StringIO(self._decompressed_block_bytes)
    decoder = avroio.BinaryDecoder(block_stream)
    reader = avroio.DatumReader(
        writers_schema=self._schema, readers_schema=self._schema)

    emitted = 0
    while emitted < self._num_records:
      yield reader.read(decoder)
      emitted += 1
예제 #17
0
 def callback(ch, method, properties, body):
     """Decode one Avro-encoded message body and print its size and timing.

     Args:
       ch: unused by this function.
       method: unused by this function.
       properties: unused by this function.
       body: bytes holding a single Avro binary datum.

     The schema is loaded from ``schemas/<exchange>.avsc``, where
     ``exchange`` comes from the enclosing scope.
     """
     # time.clock() was removed in Python 3.8; perf_counter() is the
     # replacement for measuring elapsed time. Unlike clock() on Unix
     # (CPU time), it includes the time spent sleeping below.
     start_time = time.perf_counter()
     bytes_reader = BytesIO(body)
     decoder = avro_io.BinaryDecoder(bytes_reader)
     # Close the schema file deterministically instead of leaking the handle.
     with open(f"schemas/{exchange}.avsc", "rb") as schema_file:
         reader = avro_io.DatumReader(schema.Parse(schema_file.read()))
     event_body = reader.read(decoder)
     time.sleep(0.1)  # Mock feature computing time
     print(f"Event received:"
           f"size: {sys.getsizeof(event_body)} bytes,"
           f"time: {time.perf_counter() - start_time} secs")
예제 #18
0
  def testNoDefaultValue(self):
    """Reading must fail when the reader schema has a field without default.

    LONG_RECORD_DATUM is written with LONG_RECORD_SCHEMA and read back
    with a reader schema whose only field is "H"; the read is expected to
    raise SchemaResolutionException.
    """
    reader_schema = schema.Parse("""\
      {"type": "record", "name": "Test",
       "fields": [{"name": "H", "type": "int"}]}""")

    buf, _encoder, _datum_writer = write_datum(
        LONG_RECORD_DATUM, LONG_RECORD_SCHEMA)
    decoder = avro_io.BinaryDecoder(io.BytesIO(buf.getvalue()))
    datum_reader = avro_io.DatumReader(LONG_RECORD_SCHEMA, reader_schema)
    with self.assertRaises(avro_io.SchemaResolutionException):
      datum_reader.read(decoder)
예제 #19
0
파일: datafile.py 프로젝트: maduhu/HDP-hue
 def _read_block_header(self):
     """Read the next block's header and choose the decoder for its data.

     Stores the block's record count in ``self.block_count``. For the
     ``null`` codec the raw decoder is reused; for every other codec the
     block is inflated with zlib and decoded from memory.
     """
     self.block_count = self.raw_decoder.read_long()

     if self.codec == "null":
         # The block's byte length is not needed; step over it.
         self.raw_decoder.skip_long()
         self._datum_decoder = self._raw_decoder
         return

     # The payload is stored as (length, data), the same layout used to
     # encode the "bytes" type.
     compressed = self.raw_decoder.read_bytes()
     # -15 is the log of the window size; the negative sign selects "raw"
     # decompression without zlib headers (see zlib.h).
     inflated = zlib.decompress(compressed, -15)
     self._datum_decoder = io.BinaryDecoder(StringIO(inflated))
예제 #20
0
파일: test_io.py 프로젝트: zhilinwang/avro
  def test_no_default_value(self):
    """Reading must fail when the reader schema has a field without default.

    LONG_RECORD_DATUM is written with LONG_RECORD_SCHEMA and read back
    with a reader schema whose only field is "H"; the read is expected to
    raise SchemaResolutionException.
    """
    print_test_name('TEST NO DEFAULT VALUE')

    readers_schema = schema.parse("""\
      {"type": "record", "name": "Test",
       "fields": [{"name": "H", "type": "int"}]}""")

    buf, _encoder, _datum_writer = write_datum(
        LONG_RECORD_DATUM, LONG_RECORD_SCHEMA)
    decoder = io.BinaryDecoder(StringIO(buf.getvalue()))
    datum_reader = io.DatumReader(LONG_RECORD_SCHEMA, readers_schema)
    with self.assertRaises(io.SchemaResolutionException):
      datum_reader.read(decoder)
예제 #21
0
  def testUnknownSymbol(self):
    """Reading must fail when the written symbol is unknown to the reader.

    'FOO' is written with an enum schema that contains it and read back
    with an enum schema that does not; the read is expected to raise
    SchemaResolutionException.
    """
    writer_schema = schema.Parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["FOO", "BAR"]}""")
    reader_schema = schema.Parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["BAR", "BAZ"]}""")

    buf, _encoder, _datum_writer = write_datum('FOO', writer_schema)
    decoder = avro_io.BinaryDecoder(io.BytesIO(buf.getvalue()))
    datum_reader = avro_io.DatumReader(writer_schema, reader_schema)
    with self.assertRaises(avro_io.SchemaResolutionException):
      datum_reader.read(decoder)
예제 #22
0
            def decode(self, data, *, type_identifier: int = None, **kwargs):  # pylint: disable=arguments-differ
                """ Deserialize *data* with a schema fetched from the registry.

                :param data: a bytes object containing a serialized message.

                :param type_identifier: identifier handed to the registry's
                  ``get_schema_by_id`` to obtain the Avro schema used for
                  decoding.

                :param kwargs: accepted but not used by this method.

                :returns: the object produced by the Avro datum reader.
                """
                writer_schema = self.registry.get_schema_by_id(type_identifier)
                decoder = avro_io.BinaryDecoder(io.BytesIO(data))
                return avro_io.DatumReader(writer_schema).read(decoder)
예제 #23
0
파일: test_io.py 프로젝트: zhilinwang/avro
  def test_unknown_symbol(self):
    """Reading must fail when the written symbol is unknown to the reader.

    'FOO' is written with an enum schema that contains it and read back
    with an enum schema that does not; the read is expected to raise
    SchemaResolutionException.
    """
    print_test_name('TEST UNKNOWN SYMBOL')

    writers_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["FOO", "BAR"]}""")
    readers_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["BAR", "BAZ"]}""")

    buf, _encoder, _datum_writer = write_datum('FOO', writers_schema)
    decoder = io.BinaryDecoder(StringIO(buf.getvalue()))
    datum_reader = io.DatumReader(writers_schema, readers_schema)
    with self.assertRaises(io.SchemaResolutionException):
      datum_reader.read(decoder)
예제 #24
0
    def respond(self, call_request):
        """
        Called by a server to deserialize a request, compute and serialize
        a response or error. Compare to 'handle()' in Thrift.

        Args:
          call_request: the serialized request; it is wrapped in a StringIO
            and decoded with a BinaryDecoder.

        Returns:
          When the handshake fails (process_handshake returns None), the
          bytes written to the response buffer so far. The rest of the
          return path is not visible in this excerpt.
        """
        buffer_reader = StringIO(call_request)
        buffer_decoder = io.BinaryDecoder(buffer_reader)
        buffer_writer = StringIO()
        buffer_encoder = io.BinaryEncoder(buffer_writer)
        # Set when invoking the local handler raises (see the inner try).
        error = None
        response_metadata = {}

        try:
            remote_protocol = self.process_handshake(buffer_decoder,
                                                     buffer_encoder)
            # handshake failure
            if remote_protocol is None:
                return buffer_writer.getvalue()

            # read request using remote protocol
            request_metadata = META_READER.read(buffer_decoder)
            remote_message_name = buffer_decoder.read_utf8()

            # get remote and local request schemas so we can do
            # schema resolution (one fine day)
            remote_message = remote_protocol.messages.get(remote_message_name)
            if remote_message is None:
                fail_msg = 'Unknown remote message: %s' % remote_message_name
                raise schema.AvroException(fail_msg)
            local_message = self.local_protocol.messages.get(
                remote_message_name)
            if local_message is None:
                fail_msg = 'Unknown local message: %s' % remote_message_name
                raise schema.AvroException(fail_msg)
            # The remote side wrote the request; the local side reads it.
            writers_schema = remote_message.request
            readers_schema = local_message.request
            request = self.read_request(writers_schema, readers_schema,
                                        buffer_decoder)

            # perform server logic
            try:
                response = self.invoke(local_message, request)
            except AvroRemoteException, e:
                error = e
            except Exception, e:
                # Any other failure is wrapped so it can be reported as a
                # remote exception.
                error = AvroRemoteException(str(e))
        # NOTE(review): the excerpt ends here; the except/finally clause of
        # the outer try and the code that writes the response are not visible.
def dump():
    """Round-trip the JSON sample through Avro and write size artefacts.

    Steps, in order:
      1. Load ``bytes_json`` and convert the record timestamp and every IMU
         leaf value to ``str``.
      2. Encode the data with the module-level ``writer`` and print the
         encoding time, the encoded length and the encoded type.
      3. Write the encoded bytes to ``avro_output``, their zlib-compressed
         form to ``compressed_bytes_avro`` and the base64 encoding of the
         compressed form to ``compressed_bytes_base64_fromAvro``.
      4. Decode the bytes again with the module-level ``reader`` and print
         the decoding time.

    Every file is opened in a ``with`` block so it is closed even if a
    later step raises.
    """
    with open('bytes_json', 'rb') as f:
        data = json.load(f)
    data['RecordId']['Timestamp'] = str(data['RecordId']['Timestamp'])

    # Only existing keys are reassigned, so mutating while iterating is safe.
    for orientations in data['IMU'].values():
        for dimensions in orientations.values():
            for dimension in dimensions:
                dimensions[dimension] = str(dimensions[dimension])

    bytes_writer = io.BytesIO()
    encoder = avroIo.BinaryEncoder(bytes_writer)
    start_time = time.time()
    writer.write(data, encoder)
    print("encoding time for avro in seconds: %s" % (time.time() - start_time))

    # Named to avoid shadowing the builtin ``bytes``.
    encoded_bytes = bytes_writer.getvalue()
    print(len(encoded_bytes))
    print(type(encoded_bytes))

    with open('avro_output', 'wb') as f:
        f.write(encoded_bytes)

    compressed_data = zlib.compress(encoded_bytes)  # bytes
    with open('compressed_bytes_avro', 'wb') as f:
        f.write(compressed_data)

    encoded = base64.b64encode(compressed_data)
    with open('compressed_bytes_base64_fromAvro', 'wb') as f:
        f.write(encoded)

    # decoding
    bytes_reader = io.BytesIO(encoded_bytes)
    decoder = avroIo.BinaryDecoder(bytes_reader)
    start_time = time.time()
    # Only the elapsed time matters here; the decoded value is not used.
    reader.read(decoder)
    print("decoding time for avro in seconds: %s" % (time.time() - start_time))
예제 #26
0
    def request(self, message_name, request_datum):
        """
        Writes a request message and reads a response or error message.

        The handshake and the call request are encoded into one buffer and
        sent through ``self.transceiver``. If the handshake part of the
        reply reports that no call response follows, the whole request is
        issued again.
        """
        # Encode the handshake followed by the call request.
        out_buffer = StringIO()
        encoder = io.BinaryEncoder(out_buffer)
        self.write_handshake_request(encoder)
        self.write_call_request(message_name, request_datum, encoder)

        # Send it and wait for the reply.
        payload = out_buffer.getvalue()
        raw_response = self.transceiver.transceive(payload)

        # The reply starts with the handshake, then the call response.
        decoder = io.BinaryDecoder(StringIO(raw_response))
        if not self.read_handshake_response(decoder):
            return self.request(message_name, request_datum)
        return self.read_call_response(message_name, decoder)
    def input(self, data, count):
        """Receive input from the server.

        Parameters
        ------------------------------------------------------
        data - Should contain the bytes encoding the serialized data
              (wrapped in a StringIO before decoding)
        count - how many input records are provided in the binary stream

        For a MAP task each record is read and passed to ``self.map``. For
        a REDUCE task each record is read into ``self.midRecord``; when its
        key differs from the previous record's key the previous group is
        flushed before the new record is reduced.

        Any exception is logged with its traceback and reported through
        ``self.fail`` instead of propagating.
        """
        try:
            bdata = StringIO(data)
            decoder = avio.BinaryDecoder(bdata)

            for _ in range(count):
                if self.taskType == TaskType.MAP:
                    inRecord = self.inReader.read(decoder)

                    # Do we need to pass midCollector if its declared as an instance variable
                    self.map(inRecord, self.midCollector)

                elif self.taskType == TaskType.REDUCE:
                    # store the previous record
                    prev = self.midRecord

                    # read the new record
                    self.midRecord = self.midReader.read(decoder)
                    # Identity check: only "no previous record yet" should
                    # suppress the flush.
                    if prev is not None and not keys_are_equal(
                            self.midRecord, prev, self._red_fkeys):
                        # since the key has changed we need to finalize the processing
                        # for this group of key,value pairs
                        self.reduceFlush(prev, self.outCollector)
                    self.reduce(self.midRecord, self.outCollector)

        except Exception:
            # Deliberately broad: every failure is turned into a task
            # failure report rather than an uncaught exception.
            estr = traceback.format_exc()
            self.log.warning("failing: " + estr)
            self.fail(estr)
예제 #28
0
  def __init__(self, reader, datum_reader):
    """Set up a data file reader and consume the file header.

    Args:
      reader: open file to read from.
      datum_reader: datum reader; its writers schema is set from the
        schema stored in the file metadata.

    Raises:
      DataFileException: if the codec named in the file metadata is not
        in VALID_CODECS.
    """
    self._reader = reader
    self._raw_decoder = io.BinaryDecoder(reader)
    # May be replaced each time a new block is started.
    self._datum_decoder = None
    self._datum_reader = datum_reader

    # Header layout: magic, meta, sync.
    self._read_header()

    # A missing codec entry means the "null" codec.
    stored_codec = self.get_meta('avro.codec')
    self.codec = "null" if stored_codec is None else stored_codec
    if self.codec not in VALID_CODECS:
      raise DataFileException('Unknown codec: %s.' % self.codec)

    self._file_length = self.determine_file_length()

    # No block has been read yet.
    self._block_count = 0
    schema_json = self.get_meta(SCHEMA_KEY)
    self.datum_reader.writers_schema = schema.parse(schema_json)
예제 #29
0
파일: test.py 프로젝트: tarunsmalviya/smap
 def read(self, fp, schema):
     """Read one datum from ``fp`` using a schema looked up by name.

     ``schema`` is the short name; it is prefixed with
     'edu.berkeley.cs.local.' and looked up in ``self.names``. The result
     is used as both writer and reader schema.
     """
     full_name = 'edu.berkeley.cs.local.' + schema
     record_schema = self.names.get_name(full_name, None)
     datum_reader = io.DatumReader(writers_schema=record_schema,
                                   readers_schema=record_schema)
     return datum_reader.read(io.BinaryDecoder(fp))
예제 #30
0
 def decoder(p):
     """Read one datum from ``p`` with ``avro_reader`` from the outer scope."""
     return avro_reader.read(io.BinaryDecoder(p))