def __init__(self, output_file: str, meta_file: str, utc_start_timestamp: int = None):
    """Create a converter: load meta, open the output file, and persist the
    first-pass header and the meta block immediately.

    Args:
        output_file(str): full path of the binary file to create.
        meta_file(str): path to the meta file (yaml).
        utc_start_timestamp(int): optional UTC timestamp treated as tick 0;
            when given, item timestamps will not move the start time.
    """
    # keep the fp as None until the meta is loaded, so cleanup is safe
    # even if meta parsing raises
    self._output_fp = None

    self._meta = BinaryMeta()
    self._meta.from_file(meta_file)

    self._output_fp = open(output_file, "wb+")

    # running statistics that end up in the file header
    self._item_count = 0
    self._item_size = self._meta.item_size

    # meta block sits right after the fixed-size header
    self._meta_offset = header_struct.size
    self._meta_size = 0
    self._data_offset = 0
    self._data_size = 0
    self._endtime = 0

    # when an explicit start timestamp is supplied, pin it and mark the
    # start time as already decided so later logic will not change it
    if utc_start_timestamp is not None:
        self._starttime = utc_start_timestamp
        self._is_starttime_changed = True
    else:
        self._starttime = 0
        self._is_starttime_changed = False

    # write header for the 1st time, then the meta
    self._update_header()
    self._write_meta()
def __init__(self, file_path: str, enable_value_adjust: bool = False, buffer_size: int = 100):
    """Open *file_path* read-only, memory-map it, parse header and meta,
    and position the cursor at the start of the data area.

    Args:
        file_path(str): binary file path to read; a leading "~" is expanded.
        enable_value_adjust(bool): whether values of 'value_adjust'-enabled
            fields should be randomly adjusted while reading.
        buffer_size(int): size (in items) of the in-memory read buffer.
    """
    self._enable_value_adjust = enable_value_adjust
    self._buffer_size = buffer_size
    self.header: FileHeader = None
    self._meta = BinaryMeta()
    self._file_fp = None
    self._mmap: mmap.mmap = None

    # expand a leading "~" into the user's home directory
    if file_path.startswith("~"):
        file_path = os.path.expanduser(file_path)

    self._file_fp = open(file_path, "rb")

    # mmap protection flags are named differently on Windows vs POSIX
    mm_kwargs = (
        {"access": mmap.ACCESS_READ}
        if sys.platform == "win32"
        else {"prot": mmap.PROT_READ}
    )
    self._mmap = mmap.mmap(self._file_fp.fileno(), 0, **mm_kwargs)

    self._read_header()
    self._read_meta()

    # double buffer used to batch item decoding
    self._item_buffer = ItemBuffer(buffer_size, self._meta, enable_value_adjust)

    # start_time_offset -> file offset cache; items() uses it to speed up
    # repeated queries over the same range
    self._starttime_offset_history = {}

    # jump to the beginning of the data area
    self._mmap.seek(self.header.data_offset)

    # bytes of data already consumed, used to detect the data boundary
    self._readed_data_size = 0
class BinaryReader:
    """Read binary file converted by csv converter.

    Examples:

        .. code-block:: python

            reader = BinaryReader(bin_file)

            # read items in between 0-10 minute (relative to binary start time)
            for item in reader.items(0, 10, time_unit="m"):
                print(item)

            # or get a picker that supports query by tick sequentially
            picker = reader.items_tick_picker(0, 10, time_unit="m")

            for tick in range(0, 10):
                for item in picker.items(tick):
                    print(item)

    Args:
        file_path(str): binary file path to read.
        enable_value_adjust(bool): if reader should randomly adjust the value of
            fields that enabled the 'value_adjust' feature in meta.
        buffer_size(int): size (in items) of the in-memory buffer.
    """

    def __init__(self, file_path: str, enable_value_adjust: bool = False, buffer_size: int = 100):
        self._enable_value_adjust = enable_value_adjust
        # parsed file header; populated by _read_header()
        self.header: FileHeader = None
        self._meta = BinaryMeta()
        self._buffer_size = buffer_size
        self._file_fp = None
        self._mmap: mmap.mmap = None

        # expand a leading "~" into the user's home directory
        if file_path.startswith("~"):
            file_path = os.path.expanduser(file_path)

        self._file_fp = open(file_path, "rb")

        # mmap flags are named differently on Windows vs POSIX
        if sys.platform == "win32":
            self._mmap = mmap.mmap(
                self._file_fp.fileno(), 0, access=mmap.ACCESS_READ)
        else:
            self._mmap = mmap.mmap(
                self._file_fp.fileno(), 0, prot=mmap.PROT_READ)

        self._read_header()
        self._read_meta()

        # double buffer to read data
        self._item_buffer = ItemBuffer(
            buffer_size, self._meta, enable_value_adjust)

        # contains starttime offset related file offset, used in items() method
        # use this to speedup the querying
        self._starttime_offset_history = {}

        # move the pointer to data area
        self._mmap.seek(self.header.data_offset)

        # data length (in byte) we already loaded, used to check data boundary
        self._readed_data_size = 0

    @property
    def meta(self) -> BinaryMeta:
        """Meta data in binary file."""
        return self._meta

    @property
    def start_datetime(self) -> datetime:
        """Start datetime of this file (UTC)."""
        return self._to_utc_datetime(self.header.starttime)

    @property
    def end_datetime(self) -> datetime:
        """End datetime of this file (UTC)."""
        return self._to_utc_datetime(self.header.endtime)

    def items_tick_picker(self, start_time_offset: int = 0, end_time_offset: int = None, time_unit: str = "s"):
        """Filter items by specified time range, and then pick by tick sequentially.

        Args:
            start_time_offset(int): which tick (in *time_unit*) to start from.
            end_time_offset(int): which tick (in *time_unit*) to end at; None means file end.
            time_unit (str): unit of time used to calculate offset,
                's': seconds, 'm': minute, 'h': hour, 'd': day.
        """
        # wrap the filtered item generator so callers can pull items tick by tick
        item_filter = self.items(start_time_offset, end_time_offset, time_unit)

        return ItemTickPicker(item_filter, self.header.starttime, time_unit)

    def items(self, start_time_offset: int = 0, end_time_offset: int = None, time_unit: str = "s"):
        """Generator over all items in the specified time range.

        Args:
            start_time_offset(int): which tick (in *time_unit*) to start from.
            end_time_offset(int): which tick (in *time_unit*) to end at; None means file end.
            time_unit (str): unit of time used to calculate offset,
                's': seconds, 'm': minute, 'h': hour, 'd': day.
        """
        # reset to read from beginning
        self.reset()

        # default offset: beginning of the data area
        offset = self.header.data_offset

        # time range (absolute timestamps) to filter with
        start_time = calc_time_offset(self.header.starttime, start_time_offset, time_unit)

        if end_time_offset is None:
            end_time = self.header.endtime
        else:
            end_time = calc_time_offset(
                self.header.starttime, end_time_offset, time_unit)

        # check if we have used this filter before; if so, jump straight to
        # the cached file offset instead of scanning from the start
        has_filter_history = False

        if start_time_offset in self._starttime_offset_history:
            has_filter_history = True
            offset = self._starttime_offset_history[start_time_offset]

        # fulfill buffer for first time using
        # seek to the data part to go through all the items
        self._mmap.seek(offset)
        self._fulfill_buffer()

        pre_mmap_offset = self._mmap.tell()

        while True:
            # read and return items from buffer; empty buffer means EOF
            buffer = self._item_buffer

            if buffer.item_number == 0:
                break

            is_finished = False

            for item in buffer.items():
                if start_time <= item.timestamp <= end_time:
                    # record the filter history on first match
                    if not has_filter_history:
                        has_filter_history = True

                        # return to the start of the buffer
                        pos = pre_mmap_offset - buffer.item_number * self._meta.item_size

                        self._starttime_offset_history[start_time_offset] = pos

                    yield item

                # NOTE(review): items past end_time terminate the scan — this
                # assumes items are stored in non-decreasing timestamp order
                if item.timestamp > end_time:
                    is_finished = True

                    return

            if not is_finished:
                # then start another one
                pre_mmap_offset = self._mmap.tell()

                self._fulfill_buffer()
            else:
                break

    def reset(self):
        """Reset binary reader (consumed-data counter only; the mmap cursor is
        re-seeked by items())."""
        self._readed_data_size = 0

    def __del__(self):
        """Clear resources."""
        self.close()

    def close(self):
        """Close mmap and file handle (idempotent)."""
        if self._mmap and not self._mmap.closed:
            self._mmap.close()

            self._mmap = None

        if self._file_fp and not self._file_fp.closed:
            self._file_fp.close()

            self._file_fp = None

    def _to_utc_datetime(self, timestamp: int):
        """Convert timestamp into datetime."""
        # TODO: make it as a common method
        if sys.platform == "win32":
            # NOTE(review): `timestamp_start` and `relativedelta` are not defined
            # anywhere in this file's visible scope — presumably a module-level UTC
            # epoch constant and dateutil's relativedelta; confirm they are imported.
            return (timestamp_start + relativedelta(seconds=timestamp))
        else:
            return datetime.utcfromtimestamp(timestamp).replace(tzinfo=UTC)

    def _read_header(self):
        """Read and unpack the header part from the start of the mmap."""
        header_bytes = memoryview(self._mmap[0:header_struct.size])

        self.header = FileHeader._make(header_struct.unpack_from(header_bytes))

        # validate header
        # if current version less than file, then a warning
        if VERSION < self.header.version:
            warnings.warn(
                f"File version is greater than current reader version, may cause unknown behavior!.")

    def _read_meta(self):
        """Read meta part using offsets recorded in the header."""
        meta_bytes = self._mmap[self.header.meta_offset: self.header.meta_offset + self.header.meta_size]

        self._meta.from_bytes(meta_bytes)

    def _fulfill_buffer(self):
        """Fulfill buffer from file, clamped to the remaining data size."""
        buffer = self._item_buffer

        size_to_read = self._meta.item_size * self._buffer_size

        remaining_size = self.header.data_size - self._readed_data_size

        size_to_read = min(size_to_read, remaining_size)

        if size_to_read <= 0:
            # no data left: signal EOF to the buffer
            buffer.write(None)
        else:
            item_bytes = self._mmap.read(size_to_read)

            self._readed_data_size += len(item_bytes)

            buffer.write(item_bytes)
class BinaryConverter:
    """Convert csv file into binary with specified meta.

    The output binary file is composed of:

    1. header: file type, start/end time etc.
    2. meta: meta content after validation
    3. items

    Args:
        output_file(str): output binary file full path.
        meta_file(str): path to the meta file (yaml).
        utc_start_timestamp(int): start timestamp in UTC which will be considered
            as tick 0, used to adjust the data reader pipeline.
    """

    def __init__(self, output_file: str, meta_file: str, utc_start_timestamp: int = None):
        # keep None until meta is loaded so __del__ stays safe if parsing raises
        self._output_fp = None
        self._meta = BinaryMeta()

        self._meta.from_file(meta_file)

        self._output_fp = open(output_file, "wb+")

        self._item_count = 0
        self._item_size = self._meta.item_size

        # meta block sits right after the fixed-size header
        self._meta_offset = header_struct.size
        self._meta_size = 0
        self._data_offset = 0
        self._data_size = 0
        self._starttime = 0
        self._endtime = 0

        # is starttime changed for 1st time
        self._is_starttime_changed = False

        # if we have a start timestamp, then use it in binary
        if utc_start_timestamp is not None:
            self._starttime = utc_start_timestamp

            # set it to True so that following logic will not change start time again
            self._is_starttime_changed = True

        # write header for 1st time, and meta
        self._update_header()
        self._write_meta()

    def add_csv(self, csv_file: str):
        """Convert specified csv file into current binary file, this converter
        will not sort the items.

        This method can be called several times to convert multiple csv files
        into one binary; the order will be the same as the calling sequence.

        Args:
            csv_file(str): csv to convert.
        """
        with open(csv_file, newline='') as csv_fp:
            reader = DictReader(csv_fp)

            # write items
            self._write_items(reader)

    def flush(self):
        """Flush the result into output file (re-writes the header with the
        latest counts/offsets/time range)."""
        self._update_header()

    def __del__(self):
        """Resource collecting: flush and close the output file if still open."""
        if self._output_fp is not None and not self._output_fp.closed:
            self.flush()

            self._output_fp.flush()
            self._output_fp.close()

    def _update_header(self):
        """Pack and (re-)write the file header at offset 0, then seek back to
        the file end so item writes continue appending."""
        header_bytes = header_struct.pack(
            b"MARO",
            SINGLE_BIN_FILE_TYPE,
            VERSION,
            self._item_count,
            self._item_size,
            self._meta_offset,
            self._meta_size,
            self._data_offset,
            self._data_size,
            self._starttime,
            self._endtime
        )

        self._meta_offset = len(header_bytes)

        self._output_fp.seek(0, 0)  # seek the output file beginning
        self._output_fp.write(header_bytes)
        self._output_fp.seek(0, 2)  # seek to the file end

    def _write_meta(self):
        """Write file meta and record meta size / data offset for the header."""
        meta_bytes = self._meta.to_bytes()

        # update header info
        self._data_offset = self._meta_offset + len(meta_bytes)
        self._meta_size = len(meta_bytes)

        self._output_fp.write(meta_bytes)

    def _write_items(self, reader: DictReader):
        """Write items into binary.

        Rows with an un-parsable column are skipped; missing columns default to 0.
        Also keeps the running start/end time taken from the "timestamp" field.
        """
        columns = self._meta.columns  # columns need to convert
        values = [0] * len(columns.keys())  # values buffer from each row, used to pack into binary
        buffer = memoryview(bytearray(self._meta.item_size))  # item binary buffer
        field_type_dict = self._meta.items()  # field -> data type

        for row in reader:
            field_index = 0

            # BUGFIX: reset the invalid flag for every row — previously it was
            # set once and never cleared, so a single bad row caused all
            # following rows to be skipped silently
            has_invalid_column = False

            # clear the values
            for j in range(len(values)):
                values[j] = 0

            # read from current row
            for field, dtype in field_type_dict.items():
                column_name = columns[field]

                # NOTE: we allow field not exist in csv file, the value will be zero
                if column_name in row:
                    val = convert_val(row[column_name], dtype, self._meta.time_zone)

                    values[field_index] = val

                    # a column that cannot be parsed invalidates the whole row
                    if val is None:
                        has_invalid_column = True
                        break

                    # keep the start and end tick
                    if field == "timestamp":
                        if not self._is_starttime_changed:
                            self._is_starttime_changed = True
                            self._starttime = val
                        else:
                            self._starttime = min(self._starttime, val)

                        self._endtime = max(val, self._endtime)

                field_index += 1

            if not has_invalid_column:
                # convert item into bytes buffer, and write to file
                self._meta.item_to_bytes(values, buffer)
                self._output_fp.write(buffer)

                # update header fields for final update
                self._item_count += 1
                self._data_size += self._item_size