def test_validation_errors(self):
    """Check that ``_validate_data_for_write`` raises for inconsistent tag data."""

    def assert_write_rejected(box, expected_error):
        # Every scenario below expects validation to fail with a specific error.
        with self.assertRaises(expected_error):
            box._validate_data_for_write()

    # Scenario 1: tags replaced by freshly constructed ones (no data assigned here).
    box = example_time_box('')
    box._tags = {
        0: TimeBoxTag(0, 1, 'u'),
        1: TimeBoxTag(1, 2, 'i'),
        2: TimeBoxTag(2, 4, 'f'),
    }
    assert_write_rejected(box, DataDoesNotMatchTagDefinitionError)

    # Scenario 2: the data of one tag is explicitly cleared.
    box = example_time_box('')
    box._tags[0].data = None
    assert_write_rejected(box, DataDoesNotMatchTagDefinitionError)

    # Scenario 3: the dtype of one tag is explicitly cleared.
    box = example_time_box('')
    box._tags[0].dtype = None
    assert_write_rejected(box, DataDoesNotMatchTagDefinitionError)

    # Scenario 4 (same box): dtype restored, but data is a single-element array.
    box._tags[0].dtype = np.uint8
    box._tags[0].data = np.array([1], dtype=np.uint8)
    assert_write_rejected(box, DataShapeError)
def test_timebox_floating_point_rounding(self):
    """Encode and decode a float tag with compression and 2-decimal rounding."""
    tag = TimeBoxTag(0, 8, 'f')
    tag.use_compression = True
    tag.floating_point_rounded = True
    tag.num_decimals_to_store = 2
    tag.data = np.array([0.5, -0.5, 10.2345, 0], np.float64)

    tag.encode_data()

    # Compression metadata recorded by encode_data().
    self.assertEqual('m', tag._compression_mode)
    self.assertEqual('u', tag._compressed_type_char)
    self.assertEqual(2, tag._compressed_bytes_per_value)
    self.assertEqual(-50, tag._compression_reference_value)
    self.assertEqual(np.int64, tag._compression_reference_value_dtype)

    # Encoded values, checked position by position.
    expected_encoded = (100, 0, 1023 + 50, 50)
    for position, stored in enumerate(expected_encoded):
        self.assertEqual(stored, tag._encoded_data[position])

    tag._decode_data()

    # Decoded data is float64 again; 10.2345 comes back as 10.23.
    self.assertEqual(np.float64, tag.data.dtype)
    expected_decoded = (0.5, -0.5, 10.23, 0)
    for position, value in enumerate(expected_decoded):
        self.assertEqual(value, tag.data[position])
def example_tag_definitions():
    """Return five example TimeBoxTag definitions keyed by integer identifier."""
    # (identifier, bytes per value, type character)
    specs = (
        (0, 1, 'u'),
        (1, 2, 'i'),
        (2, 4, 'f'),
        (255, 8, 'i'),
        (256, 8, 'f'),
    )
    return {
        identifier: TimeBoxTag(identifier, bytes_per_value, type_char)
        for identifier, bytes_per_value, type_char in specs
    }
def test_tag_to_bytes(self):
    """Check the byte layout produced by ``TimeBoxTag.info_to_bytes``."""
    tag = TimeBoxTag(1, 8, 'u', options=0)
    result = tag.info_to_bytes(1, False)
    self.assertEqual(41, result[0])

    raw = result.byte_code
    # Leading bytes, in order: identifier, options byte 1, options byte 2,
    # bytes per value, type char (117 == ord('u')).
    expected_leading = (1, 0, 0, 8, 117)
    for offset, expected in enumerate(expected_leading):
        self.assertEqual(expected, raw[offset])

    # Next four bytes: number of bytes of extra information.
    self.assertEqual(b'\x00\x00\x00\x00', raw[5:9])
def test_encode_decode_def_bytes_floating_point_rounding(self):
    """Round-trip the floating-point rounding settings through the definition bytes."""
    source = TimeBoxTag(0, 8, 'f', options=0)

    # With no options set, the definition bytes are 32 zero bytes.
    blank = b'\x00' * 32
    self.assertEqual(source._encode_def_bytes(), blank)

    source.floating_point_rounded = True
    source.num_decimals_to_store = 2
    encoded = source._encode_def_bytes()
    self.assertEqual(2, encoded[0])

    # A second tag built with options=4 decodes the same bytes.
    target = TimeBoxTag(0, 8, 'f', options=4)
    target._decode_def_bytes(encoded)
    self.assertTrue(target.floating_point_rounded)
    self.assertEqual(2, target.num_decimals_to_store)
def from_pandas(cls, df: pd.DataFrame):
    """
    Build a TimeBox from a pandas DataFrame.

    The frame is sorted by its index, the index values are converted with
    ``astype(np.datetime64)``, and every column becomes one tag whose byte
    width and type character come from ``parse_pandas_dtype``.
    The index is expected to be of type Timestamp, or strings that can be
    converted to Timestamp. All column dtypes must be in the
    float/int/u-int family.
    :param df: pandas DataFrame
    :return: TimeBox object
    """
    # Work on a copy sorted by date; the caller's frame is left untouched.
    logging.debug('Before sorting: {}'.format(df.head()))
    ordered = df.sort_index()
    logging.debug('After sorting: {}'.format(ordered.head()))

    box = TimeBox()
    box._tag_names_are_strings = True

    # Convert the index into a numpy array of datetime64 values.
    logging.debug('Datetime index dtype before and after:\n{}'.format(
        ordered.index.dtype))
    box._dates = ordered.index.values.astype(np.datetime64)
    logging.debug('after: {}'.format(box._dates.dtype))

    # The start date is the earliest date, taken at second resolution.
    second_resolution = np.dtype('datetime64[s]')
    box._start_date = np.amin(box._dates.astype(second_resolution))
    logging.debug('Min date: {}'.format(box._start_date))
    box._date_differentials_stored = True
    box._num_points = box._dates.size

    # One tag per column, named after the column.
    for column in ordered.columns:
        dtype_info = parse_pandas_dtype(ordered[column].dtype)
        tag = TimeBoxTag(column, dtype_info[0], dtype_info[1])
        tag.data = ordered[column].values
        box._tags[column] = tag
    return box
def test_timebox_tag_decompression(self):
    """Decode hand-built 'm'-mode compressed data back to 8-byte values."""
    tag = TimeBoxTag(0, 8, 'u')
    tag.use_compression = True

    # Set the compressed state directly instead of running encode_data().
    tag._encoded_data = np.array([0, 1, 2, 5], np.uint8)
    tag._compression_mode = 'm'
    tag._compressed_bytes_per_value = 1
    tag._compressed_type_char = 'u'
    tag._compression_reference_value = 1000000

    tag._decode_data()

    self.assertEqual(4, tag.data.size)
    self.assertEqual(8, tag.data.itemsize)
    # Each decoded value is the reference value plus the stored offset.
    expected_values = (1000000, 1000001, 1000002, 1000005)
    for position, expected in enumerate(expected_values):
        self.assertEqual(expected, tag.data[position])
def test_tag_definitions_to_from_bytes_integer(self):
    """Round-trip integer-identified tag definitions through their byte form."""
    first = TimeBox('')
    first._tag_names_are_strings = False
    first._tags = example_tag_definitions()
    first._update_required_bytes_for_tag_identifier()

    first_tag_list = [first._tags[key] for key in first._tags]
    encoded_first = TimeBoxTag.tag_list_to_bytes(
        first_tag_list,
        first._num_bytes_for_tag_identifier,
        first._tag_names_are_strings)

    # Five tags, each with a 2-byte identifier plus the fixed definition size.
    expected_size = 5 * (2 + NUM_BYTES_PER_DEFINITION_WITHOUT_IDENTIFIER)
    self.assertEqual(expected_size, encoded_first.num_bytes)
    # Sum of all encoded bytes, used as a simple checksum.
    byte_sum = np.frombuffer(encoded_first.byte_code, dtype=np.uint8).sum()
    self.assertEqual(813, byte_sum)

    second = TimeBox('')
    second._num_bytes_for_tag_identifier = 2
    second._tag_names_are_strings = False
    second._tag_definitions = TimeBoxTag.tag_definitions_from_bytes(
        encoded_first.byte_code,
        second._num_bytes_for_tag_identifier,
        second._tag_names_are_strings)

    def encode_second():
        # Serialise the decoded definitions using second's current settings.
        definitions = second._tag_definitions
        return TimeBoxTag.tag_list_to_bytes(
            [definitions[key] for key in definitions],
            second._num_bytes_for_tag_identifier,
            second._tag_names_are_strings)

    # Decoding then re-encoding reproduces the original bytes.
    encoded_second = encode_second()
    self.assertEqual(encoded_first.num_bytes, encoded_second.num_bytes)
    self.assertEqual(encoded_first.byte_code, encoded_second.byte_code)

    # A different identifier width produces different bytes.
    second._num_bytes_for_tag_identifier = 4
    mismatched = encode_second()
    self.assertNotEqual(encoded_second.byte_code, mismatched.byte_code)

    # Treating the identifiers as strings also produces different bytes.
    second._tag_names_are_strings = True
    mismatched = encode_second()
    self.assertNotEqual(encoded_second.byte_code, mismatched.byte_code)
def example_time_box(file_name: str):
    """
    Build an example TimeBox with three string-named tags of four points each.

    Date differentials are not stored; a fixed spacing of 3600 seconds is set.
    :param file_name: file name passed to the TimeBox constructor
    :return: populated TimeBox object
    """
    box = TimeBox(file_name)
    box._timebox_version = 1
    box._tag_names_are_strings = True
    box._date_differentials_stored = False
    box._num_points = 4

    # (tag name, bytes per value, type character)
    tag_specs = (
        ('tag_0', 1, 'u'),
        ('tag_1', 2, 'i'),
        ('tag_2_long_name', 4, 'f'),
    )
    box._tags = {
        name: TimeBoxTag(name, bytes_per_value, type_char)
        for name, bytes_per_value, type_char in tag_specs
    }

    box._start_date = np.datetime64('2018-01-01', 's')
    box._seconds_between_points = 3600

    # Sample data whose dtype matches each tag's definition.
    sample_data = {
        'tag_0': np.array([1, 2, 3, 4], dtype=np.uint8),
        'tag_1': np.array([-4, -2, 0, 2000], dtype=np.int16),
        'tag_2_long_name': np.array([5.2, 0.8, 3.1415, 8],
                                    dtype=np.float32),
    }
    for name, values in sample_data.items():
        box._tags[name].data = values
    return box
def test_update_required_bytes(self):
    """Check the identifier width computed by ``_update_required_bytes_for_tag_identifier``."""
    box = TimeBox('')
    box._tag_names_are_strings = False

    # A single tag with identifier 0 needs one byte.
    box._tags[0] = TimeBoxTag(0, 1, 'u')
    box._update_required_bytes_for_tag_identifier()
    self.assertEqual(1, box._num_bytes_for_tag_identifier)

    # Adding identifier 256 pushes the width to two bytes.
    box._tags[256] = TimeBoxTag(256, 1, 'u')
    box._update_required_bytes_for_tag_identifier()
    self.assertEqual(2, box._num_bytes_for_tag_identifier)

    # String identifiers: the longest name here has three characters.
    # NOTE(review): 12 presumably means 4 bytes per character - confirm.
    box._tag_names_are_strings = True
    box._tags = {
        name: TimeBoxTag(name, 1, 'u') for name in ('a', 'ab', 'abc')
    }
    box._update_required_bytes_for_tag_identifier()
    self.assertEqual(12, box._num_bytes_for_tag_identifier)
def example_time_box(file_name: str):
    """
    Build an example TimeBox with three integer-identified tags of four points each.

    Date differentials are stored: three one-byte differentials of 1, in DAYS.
    :param file_name: file name passed to the TimeBox constructor
    :return: populated TimeBox object
    """
    box = TimeBox(file_name)
    box._timebox_version = 1
    box._tag_names_are_strings = False
    box._date_differentials_stored = True
    box._num_points = 4

    # (tag identifier, bytes per value, type character)
    tag_specs = ((0, 1, 'u'), (1, 2, 'i'), (2, 4, 'f'))
    box._tags = {
        identifier: TimeBoxTag(identifier, bytes_per_value, type_char)
        for identifier, bytes_per_value, type_char in tag_specs
    }

    box._start_date = np.datetime64('2018-01-01', 's')

    # Sample data whose dtype matches each tag's definition.
    sample_data = {
        0: np.array([1, 2, 3, 4], dtype=np.uint8),
        1: np.array([-4, -2, 0, 2000], dtype=np.int16),
        2: np.array([5.2, 0.8, 3.1415, 8], dtype=np.float32),
    }
    for identifier, values in sample_data.items():
        box._tags[identifier].data = values

    box._date_differentials = np.array([1, 1, 1], dtype=np.uint8)
    box._date_differential_units = DAYS
    box._bytes_per_date_differential = 1
    return box
def _write_file_info(self, file_handle) -> int:
    """
    Writes out the file info to the file handle.

    Layout written, in order:
      * 1 byte  - timebox version (uint8)
      * 2 bytes - encoded options (uint16)
      * 1 byte  - number of tags (uint8)
      * 4 bytes - number of points (uint32)
      * 1 byte  - number of bytes used per tag identifier (uint8)
      * tag definitions, for the tags in sorted identifier order
      * 8 bytes - start date as datetime64[s]
      * if date differentials are stored: 1 byte for bytes per date
        differential (uint8) and 2 bytes for the date differential
        units (uint16); otherwise 4 bytes for seconds between points (uint32)

    :param file_handle: file handle object in 'wb' mode. pre-seeked to correct position (0)
    :return: int, seek bytes advanced in this method
    """
    np.array([np.uint8(self._timebox_version)],
             dtype=np.uint8).tofile(file_handle)
    np.array([np.uint16(self._encode_options())],
             dtype=np.uint16).tofile(file_handle)
    np.array([np.uint8(len(self._tags))],
             dtype=np.uint8).tofile(file_handle)
    np.array([np.uint32(self._num_points)],
             dtype=np.uint32).tofile(file_handle)

    # The identifier width must be refreshed before it is written and used.
    self._update_required_bytes_for_tag_identifier()
    np.array([np.uint8(self._num_bytes_for_tag_identifier)],
             dtype=np.uint8).tofile(file_handle)
    bytes_seek = 1 + 2 + 1 + 4 + 1

    # Tag definitions are written in sorted identifier order.
    sorted_tags = sorted(self._tags)
    tags_to_bytes_result = TimeBoxTag.tag_list_to_bytes(
        [self._tags[t] for t in sorted_tags],
        self._num_bytes_for_tag_identifier,
        self._tag_names_are_strings)
    file_handle.write(tags_to_bytes_result.byte_code)
    bytes_seek += tags_to_bytes_result.num_bytes

    # np.datetime64 takes the unit as its second positional argument and has
    # no ``dtype`` parameter, so convert explicitly to second resolution.
    # _read_file_info reads these 8 bytes back as 'datetime64[s]'.
    start_date = np.array([np.datetime64(self._start_date)])
    start_date.astype('datetime64[s]').tofile(file_handle)
    bytes_seek += 8

    if self._date_differentials_stored:
        np.array([np.uint8(self._bytes_per_date_differential)],
                 dtype=np.uint8).tofile(file_handle)
        int_to_store_date_diff_units = get_int_for_date_units_from_date_utils_constant(
            self._date_differential_units)
        np.array([np.uint16(int_to_store_date_diff_units)],
                 dtype=np.uint16).tofile(file_handle)
        bytes_seek += 3
    else:
        np.array([np.uint32(self._seconds_between_points)],
                 dtype=np.uint32).tofile(file_handle)
        bytes_seek += 4
    return bytes_seek
def _read_file_info(self, file_handle) -> int:
    """
    Read the file info section from a file handle and populate this object.

    Fields are read in order: version, options, tag count, point count,
    identifier width, tag definitions, start date, then either the date
    differential settings or the fixed seconds between points.
    :param file_handle: file handle object in 'rb' mode that is seeked to the correct position (0)
    :return: int, seek bytes increased since file_handle was received
    """

    def next_uint(width):
        # Read `width` bytes and decode them with read_unsigned_int.
        return read_unsigned_int(file_handle.read(width))

    # Fixed-size leading fields.
    self._timebox_version = next_uint(1)
    self._unpack_options(int(next_uint(2)))
    tag_count = next_uint(1)
    self._num_points = next_uint(4)
    self._num_bytes_for_tag_identifier = next_uint(1)
    leading_size = 1 + 2 + 1 + 4 + 1

    # Tag definitions: one fixed-size record per tag.
    per_tag_size = (self._num_bytes_for_tag_identifier
                    + NUM_BYTES_PER_DEFINITION_WITHOUT_IDENTIFIER)
    tag_block_size = tag_count * per_tag_size
    self._tags = TimeBoxTag.tag_definitions_from_bytes(
        file_handle.read(tag_block_size),
        self._num_bytes_for_tag_identifier,
        self._tag_names_are_strings)

    # Start date: one datetime64[s] value (8 bytes).
    self._start_date = np.fromfile(
        file_handle, dtype='datetime64[s]', count=1)[0]
    start_date_size = 8

    # NOTE(review): _date_differentials_stored is presumably set by
    # _unpack_options above - confirm.
    if self._date_differentials_stored:
        self._seconds_between_points = 0
        self._bytes_per_date_differential = next_uint(1)
        stored_units_code = next_uint(2)
        self._date_differential_units = get_date_utils_constant_from_stored_units_int(
            stored_units_code)
        trailing_size = 3
    else:
        self._seconds_between_points = next_uint(4)
        self._bytes_per_date_differential = 0
        self._date_differential_units = 0
        trailing_size = 4

    return leading_size + tag_block_size + start_date_size + trailing_size
def test_timebox_tag_compression(self):
    """Encode closely spaced uint64 values and check the compressed form."""
    tag = TimeBoxTag(0, 8, 'u')
    tag.use_compression = True
    tag.data = np.array([1000000, 1000001, 1000002, 1000005], np.uint64)

    tag.encode_data()

    # Compression metadata: mode 'm', one unsigned byte per value,
    # reference value equal to the smallest input.
    self.assertEqual('m', tag._compression_mode)
    self.assertEqual('u', tag._compressed_type_char)
    self.assertEqual(1, tag._compressed_bytes_per_value)
    self.assertEqual(1000000, tag._compression_reference_value)
    self.assertEqual(np.uint64, tag._compression_reference_value_dtype)

    # Encoded values are the offsets from the reference value.
    expected_offsets = (0, 1, 2, 5)
    for position, offset in enumerate(expected_offsets):
        self.assertEqual(offset, tag._encoded_data[position])
def test_tag_info_init(self):
    """Check the attributes of a TimeBoxTag after construction."""
    tag_info = TimeBoxTag('my_id', 4, 'f')
    self.assertEqual('my_id', tag_info.identifier)
    self.assertEqual(4, tag_info.bytes_per_value)
    self.assertEqual('f', tag_info.type_char)
    self.assertEqual(np.float32, tag_info.dtype)

    # The type character may also be passed as its integer code point.
    tag_info = TimeBoxTag('my_id', 4, ord('f'))
    self.assertEqual(np.float32, tag_info.dtype)

    # Defaults, checked in the same order as the attributes are listed.
    for attribute in ('data', '_encoded_data', 'num_points'):
        self.assertEqual(None, getattr(tag_info, attribute))
    for flag in ('use_compression', 'use_hash_table',
                 'floating_point_rounded'):
        self.assertFalse(getattr(tag_info, flag))
    for attribute in ('_compressed_type_char',
                      '_compressed_bytes_per_value',
                      '_compression_mode',
                      '_compression_reference_value'):
        self.assertEqual(None, getattr(tag_info, attribute))
    self.assertEqual(tag_info.dtype,
                     tag_info._compression_reference_value_dtype)
    self.assertEqual(None, tag_info.num_decimals_to_store)
    self.assertEqual(0, tag_info.num_bytes_extra_information)

    # The options integer switches individual flags on.
    with_compression = TimeBoxTag('my_id', 4, 'f', options=1)
    self.assertTrue(with_compression.use_compression)

    with_both = TimeBoxTag('my_id', 4, 'f', options=3)
    self.assertTrue(with_both.use_hash_table)

    with_hash_table = TimeBoxTag('my_id', 4, 'f', options=2)
    self.assertTrue(with_hash_table.use_hash_table)
    self.assertFalse(with_hash_table.use_compression)

    # Construction with 32 zero definition bytes; nothing further is asserted.
    TimeBoxTag('my_id', 4, 'f', options=0, untyped_bytes=b'\x00' * 32)
def test_encode_decode_def_bytes_compression(self):
    """Round-trip the compression settings through the definition bytes."""
    source = TimeBoxTag(0, 8, 'u', options=0)

    # With no options set, the definition bytes are 32 zero bytes.
    self.assertEqual(source._encode_def_bytes(), b'\x00' * 32)

    source.use_compression = True
    source._compression_mode = 'e'
    source._compressed_bytes_per_value = 2
    source._compressed_type_char = 'u'
    source._compression_reference_value = 5
    source._compression_reference_value_dtype = np.dtype(np.uint64)
    encoded = source._encode_def_bytes()

    # Leading bytes: 101 == ord('e') mode, 2 bytes per value,
    # 117 == ord('u') compressed type char.
    # NOTE(review): 8 and 117 presumably describe the reference value dtype
    # (uint64), and 5 is the reference value - confirm against the encoder.
    expected_leading = (101, 2, 117, 8, 117, 5)
    for offset, expected in enumerate(expected_leading):
        self.assertEqual(expected, encoded[offset])

    # A second tag built with options=1 decodes the same bytes.
    target = TimeBoxTag(0, 8, 'u', options=1)
    target._decode_def_bytes(encoded)
    self.assertTrue(target.use_compression)
    self.assertEqual('e', target._compression_mode)
    self.assertEqual(2, target._compressed_bytes_per_value)
    self.assertEqual('u', target._compressed_type_char)
    self.assertEqual(5, target._compression_reference_value)
def test_tag_options(self):
    """Check every combination of the three tag flags against its option code."""
    tag = TimeBoxTag(1, 8, 'u', options=0)

    def flags_for(code):
        # Bit 1 -> compression, bit 2 -> hash table, bit 4 -> float rounding.
        return bool(code & 1), bool(code & 2), bool(code & 4)

    def assert_flag(expected, actual):
        # Truthiness check, like assertTrue / assertFalse.
        if expected:
            self.assertTrue(actual)
        else:
            self.assertFalse(actual)

    # Encoding: codes 0..7 in order, then code 3 once more.
    for code in list(range(8)) + [3]:
        compression, hash_table, rounded = flags_for(code)
        tag.use_compression = compression
        tag.use_hash_table = hash_table
        tag.floating_point_rounded = rounded
        self.assertEqual(code, tag._encode_options())

    # Decoding: each code 0..7 sets exactly the matching flags.
    for code in range(8):
        compression, hash_table, rounded = flags_for(code)
        tag._decode_options(code)
        assert_flag(compression, tag.use_compression)
        assert_flag(hash_table, tag.use_hash_table)
        assert_flag(rounded, tag.floating_point_rounded)
def test_get_tag_info_dtype(self):
    """Check the structured dtype returned by ``TimeBoxTag.tag_info_dtype``."""
    actual = TimeBoxTag.tag_info_dtype(4, True)

    # The first five fields, as (name, type string) pairs.
    leading_fields = (
        ('tag_identifier', '<U1'),
        ('options', '<u2'),
        ('bytes_per_point', '|u1'),
        ('type_char', '|u1'),
        ('bytes_extra_information', '<u4'),
    )
    for position, (field_name, type_string) in enumerate(leading_fields):
        self.assertEqual(field_name, actual.descr[position][0])
        self.assertEqual(type_string, actual.descr[position][1])

    # Followed by 32 single-byte definition fields, numbered from 1.
    for index in range(0, 32):
        self.assertEqual('def_byte_{}'.format(index + 1),
                         actual.descr[5 + index][0])
        self.assertEqual('|u1', actual.descr[5 + index][1])

    # String identifiers: the byte count maps to a unicode field width.
    for num_bytes, type_string in ((16, '<U4'), (32, '<U8'), (128, '<U32')):
        actual = TimeBoxTag.tag_info_dtype(num_bytes, True)
        self.assertEqual(type_string, actual.descr[0][1])

    # Integer identifiers: the byte count maps to an unsigned integer width.
    for num_bytes, type_string in ((1, '|u1'), (2, '<u2'),
                                   (4, '<u4'), (8, '<u8')):
        actual = TimeBoxTag.tag_info_dtype(num_bytes, False)
        self.assertEqual(type_string, actual.descr[0][1])

    # test errors
    for invalid_num_bytes in (2, 0, -1):
        with self.assertRaises(TagIdentifierByteRepresentationError):
            TimeBoxTag.tag_info_dtype(invalid_num_bytes, True)
    with self.assertRaises(ValueError):
        TimeBoxTag.tag_info_dtype(0.5, False)