Example 1
    def test_with_header_reader(self):
        # Note: a header stream is only needed when the data stream itself doesn't include the Avro header
        file_path = os.path.join(AvroReaderTests._samples_dir_root, 'changeFeed.avro')
        # this data stream includes the header
        full_data_stream = BytesIO()
        with open(file_path, 'rb') as reader:
            full_data = reader.read()
            full_data_stream.write(full_data)
        # Initializing a DataFileReader here just advances the stream to the position after the first sync_marker
        DataFileReader(full_data_stream, DatumReader())
        position_after_sync_marker = full_data_stream.tell()

        # construct the partial data stream, which doesn't include the header
        partial_data_stream = _HeaderStream()
        with open(file_path, 'rb') as reader:
            reader.seek(position_after_sync_marker)
            partial_data_stream.write(reader.read())

        header_stream = _HeaderStream()
        with open(file_path, 'rb') as reader:
            header_data = reader.read()
            header_stream.write(header_data)

        df_reader = DataFileReader(partial_data_stream, DatumReader(), header_reader=header_stream)
        records = list(df_reader)
        self.assertEqual(CHANGE_FEED_RECORD, records[0])
        self.assertNotEqual(0, partial_data_stream.event_position)
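
The pattern exercised above, feeding DataFileReader a data stream that starts after a sync marker while the Avro header comes from a second stream, can be reproduced with plain BytesIO objects. Below is a minimal sketch, assuming the vendored reader from azure-storage-blob (which accepts the header_reader keyword, as the tests show); the import paths and the helper name are assumptions, not SDK API.

from io import BytesIO

# Import paths are an assumption based on the azure-storage-blob package layout.
from azure.storage.blob._shared.avro.avro_io import DatumReader
from azure.storage.blob._shared.avro.datafile import DataFileReader


def read_records_from_offset(file_path, offset_after_sync_marker):
    """Hypothetical helper: read Avro records starting mid-file by pairing a
    header-less data stream with a stream that still carries the header."""
    with open(file_path, 'rb') as f:
        full_data = f.read()

    # The header stream supplies the schema, codec metadata and sync marker.
    header_stream = BytesIO(full_data)

    # The data stream begins right after a sync marker, so it carries no header.
    partial_data_stream = BytesIO(full_data[offset_after_sync_marker:])

    reader = DataFileReader(partial_data_stream, DatumReader(),
                            header_reader=header_stream)
    return list(reader)

The test uses the _HeaderStream helper instead of BytesIO so it can also assert that event_position is updated while reading.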
Example 2
    def test_change_feed(self):
        file_path = os.path.join(AvroReaderTests._samples_dir_root, 'changeFeed.avro')
        with open(file_path, 'rb') as reader:
            datum_reader = DatumReader()
            with DataFileReader(reader, datum_reader) as dfr:
                data = list(dfr)
                self.assertEqual(1, len(data))
                expected_record = CHANGE_FEED_RECORD
                self.assertEqual(expected_record, data[0])
Example 3
    def _initialize(self, chunk_cursor=None):
        # To get all events in a chunk
        blob_client = self.client.get_blob_client(self.chunk_path)

        file_offset = chunk_cursor.get("BlockOffset") if chunk_cursor else 0

        # A non-zero offset means the downloaded Avro data won't include the Avro header,
        # so a separate header stream is only needed when resuming from an offset
        header_stream = ChangeFeedStreamer(blob_client) if file_offset else None
        self._data_stream = ChangeFeedStreamer(blob_client, chunk_file_start=file_offset)
        self.file_reader = DataFileReader(self._data_stream, DatumReader(), header_reader=header_stream)

        event_index = chunk_cursor.get("EventIndex") if chunk_cursor else 0
        for _ in range(0, event_index):
            next(self.file_reader)
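
ChangeFeedStreamer here is an SDK-internal, lazily downloading file-like wrapper around the chunk blob. As a rough illustration of the surface _initialize relies on, the stand-in below buffers the whole ranged download in memory; it is a hypothetical simplification, not the SDK class, which pages the download and adds the position-tracking hooks the vendored reader can use.

from io import BytesIO


class SimpleChunkStream(object):
    """Hypothetical, fully buffered stand-in for ChangeFeedStreamer exposing
    read/seek/tell plus the event_position attribute used in the snippets."""

    def __init__(self, blob_client, chunk_file_start=0):
        # download_blob(offset=...) performs a ranged download in azure-storage-blob 12.x.
        downloaded = blob_client.download_blob(offset=chunk_file_start).readall()
        self._chunk_file_start = chunk_file_start
        self._buffer = BytesIO(downloaded)
        self.event_position = chunk_file_start

    def read(self, size=-1):
        return self._buffer.read(size)

    def tell(self):
        # Report absolute positions within the chunk blob, so they can be saved in a cursor.
        return self._chunk_file_start + self._buffer.tell()

    def seek(self, offset, whence=0):
        if whence == 0:
            # Absolute offsets are interpreted relative to the start of the blob.
            self._buffer.seek(offset - self._chunk_file_start, 0)
        else:
            self._buffer.seek(offset, whence)
        return self.tell()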
Example 4
    def test_reader(self):
        correct = 0
        nitems = 10
        for iexample, (writer_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
            for codec in CODECS_TO_VALIDATE:
                file_path = os.path.join(AvroReaderTests._samples_dir_root, 'test_' + codec + '_' + str(iexample) + '.avro')
                with open(file_path, 'rb') as reader:
                    datum_reader = DatumReader()
                    with DataFileReader(reader, datum_reader) as dfr:
                        round_trip_data = list(dfr)
                        if ([datum] * nitems) == round_trip_data:
                            correct += 1
        self.assertEqual(
            correct,
            len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
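
The fixtures consumed by test_reader (test_<codec>_<iexample>.avro) each hold the same datum repeated nitems times under one writer schema and codec. Here is a sketch of how such a file could be generated with the standalone avro package (the SDK only vendors the reading side); write_fixture and its parameters are illustrative, not part of the test suite.

import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter


def write_fixture(file_path, writer_schema_json, datum, codec='null', nitems=10):
    # Hypothetical helper: writes the same datum nitems times so that the
    # round-trip comparison ([datum] * nitems) in test_reader holds.
    schema = avro.schema.parse(writer_schema_json)  # avro>=1.10; older avro-python3 uses schema.Parse
    writer = DataFileWriter(open(file_path, 'wb'), DatumWriter(), schema, codec=codec)
    try:
        for _ in range(nitems):
            writer.append(datum)
    finally:
        writer.close()  # also closes the underlying file object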
Example 5
    def _initialize(self, chunk_cursor=None):
        # To get all events in a chunk
        blob_client = self.client.get_blob_client(self.chunk_path)

        file_offset = chunk_cursor.get('position') if chunk_cursor else 0
        block_count = chunk_cursor.get('block_count') if chunk_cursor else 0

        # A non-zero offset means the downloaded Avro data won't include the Avro header,
        # so a separate header stream is only needed when resuming from an offset
        header_stream = ChangeFeedStreamer(blob_client) if file_offset else None
        self._data_stream = ChangeFeedStreamer(blob_client,
                                               chunk_file_start=file_offset,
                                               block_count=block_count)
        self.file_reader = DataFileReader(self._data_stream,
                                          DatumReader(),
                                          header_reader=header_stream)

        # After initializing DataFileReader, the data_stream cursor has moved past the header to the
        # start of the data section (DataFileReader consumes the header during initialization)
        self._data_stream.event_position = self._data_stream.tell()
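
The last assignment above is what makes reading resumable: event_position remembers where the current event starts inside the chunk blob, so it can be persisted and later handed back as the 'position' field of a chunk cursor. A hedged sketch of assembling such a cursor follows; whether the remaining-object counter is exposed as file_reader.block_count is an assumption, not something shown in the snippet.

def make_chunk_cursor(data_stream, file_reader):
    # Hypothetical helper mirroring the keys _initialize reads back:
    #   'position'    -> byte offset of the current event, tracked on the stream
    #   'block_count' -> objects still unread in the current Avro block
    #                    (assumed attribute of the vendored DataFileReader)
    return {
        'position': data_stream.event_position,
        'block_count': file_reader.block_count,
    }

# Passing this dict back as chunk_cursor lets _initialize restart the download at
# 'position' instead of replaying the chunk from its first byte.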