Example #1
0
    def __init__(self, output_file: str, meta_file: str, utc_start_timestamp: int = None):
        """Load the meta file, open the output binary and persist the initial header and meta."""
        self._output_fp = None

        # Parse/validate the meta before we create the output file.
        self._meta = BinaryMeta()
        self._meta.from_file(meta_file)

        self._output_fp = open(output_file, "wb+")

        # Header bookkeeping; these are re-written on every header update.
        self._item_count = 0
        self._item_size = self._meta.item_size
        self._meta_offset = header_struct.size
        self._meta_size = 0
        self._data_offset = 0
        self._data_size = 0
        self._endtime = 0

        # Start time handling: when the caller supplies a UTC timestamp we use
        # it as tick 0 and mark it as already set, so the item-writing logic
        # will not overwrite it with the first csv timestamp later on.
        self._is_starttime_changed = utc_start_timestamp is not None
        self._starttime = utc_start_timestamp if self._is_starttime_changed else 0

        # Write the header for the first time, then the meta section.
        self._update_header()
        self._write_meta()
Example #2
0
    def __init__(self, file_path: str, enable_value_adjust: bool = False, buffer_size: int = 100):
        """Open and memory-map the binary file, read its header/meta, and seek to the data area."""
        self._enable_value_adjust = enable_value_adjust

        self.header: FileHeader = None
        self._meta = BinaryMeta()

        self._buffer_size = buffer_size
        self._file_fp = None
        self._mmap: mmap.mmap = None

        # Expand "~" so the path resolves regardless of the caller's cwd.
        if file_path.startswith("~"):
            file_path = os.path.expanduser(file_path)

        self._file_fp = open(file_path, "rb")

        # Map the whole file read-only; Windows and POSIX use different kwargs.
        if sys.platform == "win32":
            map_args = {"access": mmap.ACCESS_READ}
        else:
            map_args = {"prot": mmap.PROT_READ}

        self._mmap = mmap.mmap(self._file_fp.fileno(), 0, **map_args)

        self._read_header()
        self._read_meta()

        # Double buffer used to batch item reads from the mapped file.
        self._item_buffer = ItemBuffer(
            buffer_size, self._meta, enable_value_adjust)

        # Cache mapping a start-time offset to its file offset; items() uses it
        # to avoid re-scanning from the beginning on repeated queries.
        self._starttime_offset_history = {}

        # Position the cursor at the start of the data area.
        self._mmap.seek(self.header.data_offset)

        # Bytes of data already consumed, used to detect the data boundary.
        self._readed_data_size = 0
Example #3
0
class BinaryReader:
    """Read a binary file converted by the csv converter.

    Examples:

        .. code-block:: python

            reader = BinaryReader(bin_file)

            # read items in between 0-10 minute (relative to binary start time)
            for item in reader.items(0, 10, time_unit="m"):
                print(item)

            # or get a picker that supports querying by tick sequentially
            picker = reader.items_tick_picker(0, 10, time_unit="m")

            for tick in range(0, 10):
                for item in picker.items(tick):
                    print(item)

    Args:
        file_path(str): binary file path to read
        enable_value_adjust(bool): if reader should adjust the value of fields that enabled 'value_adjust' feature in meta randomly
        buffer_size(int): size of in-memory buffer (in items)
    """

    def __init__(self, file_path: str, enable_value_adjust: bool = False, buffer_size: int = 100):
        self._enable_value_adjust = enable_value_adjust

        self.header: FileHeader = None
        self._meta = BinaryMeta()

        self._buffer_size = buffer_size
        self._file_fp = None
        self._mmap: mmap.mmap = None
        # expand "~" so the path resolves regardless of the caller's cwd
        if file_path.startswith("~"):
            file_path = os.path.expanduser(file_path)
        self._file_fp = open(file_path, "rb")

        # map the whole file read-only; Windows and POSIX use different kwargs
        if sys.platform == "win32":
            self._mmap = mmap.mmap(
                self._file_fp.fileno(), 0, access=mmap.ACCESS_READ)
        else:
            self._mmap = mmap.mmap(
                self._file_fp.fileno(), 0, prot=mmap.PROT_READ)

        self._read_header()
        self._read_meta()

        # double buffer to read data in batches instead of item-by-item
        self._item_buffer = ItemBuffer(
            buffer_size, self._meta, enable_value_adjust)

        # maps a start-time offset to the file offset where matching items begin,
        # used in items() to skip re-scanning on repeated queries
        self._starttime_offset_history = {}

        # move the pointer to data area
        self._mmap.seek(self.header.data_offset)

        # data length (in byte) we already loaded, used to check data boundary
        self._readed_data_size = 0

    @property
    def meta(self) -> BinaryMeta:
        """Meta data in binary file"""
        return self._meta

    @property
    def start_datetime(self) -> datetime:
        """Start datetime of this file (UTC)"""
        return self._to_utc_datetime(self.header.starttime)

    @property
    def end_datetime(self) -> datetime:
        """End datetime of this file (UTC)"""
        return self._to_utc_datetime(self.header.endtime)

    def items_tick_picker(self, start_time_offset: int = 0, end_time_offset: int = None, time_unit: str = "s"):
        """Filter items by the specified time range, then pick them by tick sequentially.

        Args:
            start_time_offset(int): offset (in time_unit) of the first tick to include
            end_time_offset(int): offset (in time_unit) of the last tick to include; None means the end of file
            time_unit (str): unit of time used to calculate offset, 's': seconds, 'm': minute, 'h': hour, 'd': day
        """
        item_filter = self.items(start_time_offset, end_time_offset, time_unit)

        return ItemTickPicker(item_filter, self.header.starttime, time_unit)

    def items(self, start_time_offset: int = 0, end_time_offset: int = None, time_unit: str = "s"):
        """Generator yielding all items in the specified time range.

        Args:
            start_time_offset(int): offset (in time_unit) of the first tick to include
            end_time_offset(int): offset (in time_unit) of the last tick to include; None means the end of file
            time_unit (str): unit of time used to calculate offset, 's': seconds, 'm': minute, 'h': hour, 'd': day
        """
        # reset to read from beginning
        self.reset()

        # default offset: start of the data area
        offset = self.header.data_offset

        # absolute time range to filter, derived from the file's start time
        start_time = calc_time_offset(self.header.starttime, start_time_offset, time_unit)

        if end_time_offset is None:
            end_time = self.header.endtime
        else:
            end_time = calc_time_offset(
                self.header.starttime, end_time_offset, time_unit)

        # check if we have used this filter before; if so, jump straight to
        # the cached file offset instead of scanning from the data start
        has_filter_history = False

        if start_time_offset in self._starttime_offset_history:
            has_filter_history = True

            offset = self._starttime_offset_history[start_time_offset]

        # fulfill buffer for first time using
        # seek to the data part to go through all the items
        self._mmap.seek(offset)

        self._fulfill_buffer()

        # file offset right after the buffered chunk; used to compute where the
        # current buffer started when recording filter history below
        pre_mmap_offset = self._mmap.tell()

        while True:
            # read and return an item from buffer
            buffer = self._item_buffer

            # empty buffer means we reached the end of the data area
            if buffer.item_number == 0:
                break

            is_finished = False

            for item in buffer.items():
                if start_time <= item.timestamp <= end_time:
                    # record the filter history on the first matching item
                    if not has_filter_history:
                        has_filter_history = True

                        # return to the start of the buffer
                        pos = pre_mmap_offset - buffer.item_number * self._meta.item_size

                        self._starttime_offset_history[start_time_offset] = pos

                    yield item

                # NOTE: assumes items are stored in timestamp order, so the
                # first item past end_time means no further matches exist
                if item.timestamp > end_time:
                    is_finished = True
                    return

            if not is_finished:
                # then start another one
                pre_mmap_offset = self._mmap.tell()

                self._fulfill_buffer()
            else:
                break

    def reset(self):
        """Reset binary reader so the next items() call scans from the data start"""
        self._readed_data_size = 0

    def __del__(self):
        """Clear resources"""
        self.close()

    def close(self):
        """Close the memory map and the underlying file"""
        if self._mmap and not self._mmap.closed:
            self._mmap.close()

            self._mmap = None

        if self._file_fp and not self._file_fp.closed:
            self._file_fp.close()

            self._file_fp = None

    def _to_utc_datetime(self, timestamp: int):
        """Convert timestamp into datetime"""

        # TODO: make it as a common method
        # NOTE(review): on Windows this relies on a module-level baseline
        # `timestamp_start` (presumably the epoch as a datetime) — confirm it
        # is defined where this class is used
        if sys.platform == "win32":
            return (timestamp_start + relativedelta(seconds=timestamp))
        else:
            return datetime.utcfromtimestamp(timestamp).replace(tzinfo=UTC)

    def _read_header(self):
        """Read and unpack the header part at the start of the file"""
        header_bytes = memoryview(self._mmap[0:header_struct.size])

        self.header = FileHeader._make(header_struct.unpack_from(header_bytes))

        # validate header
        # if current version less than file, then a warning
        if VERSION < self.header.version:
            warnings.warn(
                f"File version is greater than current reader version, may cause unknown behavior!.")

    def _read_meta(self):
        """Read meta part at the offset/size recorded in the header"""
        meta_bytes = self._mmap[self.header.meta_offset:
                                self.header.meta_offset + self.header.meta_size]

        self._meta.from_bytes(meta_bytes)

    def _fulfill_buffer(self):
        """Refill the item buffer from the file; writes None at end of data"""

        buffer = self._item_buffer

        # read a full buffer's worth, clamped to what remains in the data area
        size_to_read = self._meta.item_size * self._buffer_size
        remaining_size = self.header.data_size - self._readed_data_size

        size_to_read = min(size_to_read, remaining_size)

        if size_to_read <= 0:
            # signal end-of-data to the buffer
            buffer.write(None)
        else:
            item_bytes = self._mmap.read(size_to_read)

            self._readed_data_size += len(item_bytes)

            buffer.write(item_bytes)
Example #4
0
class BinaryConverter:
    """Convert csv file into binary with specified meta.

    The output binary file is composed of:

    1. header: file type, start/end time etc.
    2. meta: meta content after validation
    3. items

    Args:
        output_file(str): output binary file full path
        meta_file(str): path to the meta file (yaml)
        utc_start_timestamp(int): start timestamp in UTC which will be considered as tick 0, used to adjust the data reader pipeline

    """
    def __init__(self, output_file: str, meta_file: str, utc_start_timestamp: int = None):
        self._output_fp = None
        # load and validate meta before creating the output file
        self._meta = BinaryMeta()
        self._meta.from_file(meta_file)

        self._output_fp = open(output_file, "wb+")

        # header bookkeeping; re-written on every _update_header call
        self._item_count = 0
        self._item_size = self._meta.item_size
        self._meta_offset = header_struct.size
        self._meta_size = 0
        self._data_offset = 0
        self._data_size = 0
        self._starttime = 0
        self._endtime = 0

        # is starttime changed for 1st time
        self._is_starttime_changed = False

        # if we have a start timestamp, then use it in binary
        if utc_start_timestamp is not None:
            self._starttime = utc_start_timestamp

            # set it to True so that following logic will not change start time again
            self._is_starttime_changed = True

        # write header for 1st time, and meta
        self._update_header()
        self._write_meta()

    def add_csv(self, csv_file: str):
        """Convert specified csv file into current binary file, this converter will not sort the item.
        This method can be called several times to convert multiple csv file into one binary, the order will be same as calling sequence.

        Args:
            csv_file(str): csv to convert
        """
        with open(csv_file, newline='') as csv_fp:
            reader = DictReader(csv_fp)

            # write items
            self._write_items(reader)

    def flush(self):
        """Flush the result into output file by re-writing the (now final) header"""
        self._update_header()

    def __del__(self):
        # resource collecting: make sure the final header is persisted and the
        # file handle released even if the caller forgot to flush
        if self._output_fp is not None and not self._output_fp.closed:
            self.flush()

            self._output_fp.flush()
            self._output_fp.close()

    def _update_header(self):
        """Pack the current bookkeeping fields and write them at file offset 0"""
        header_bytes = header_struct.pack(
            b"MARO",
            SINGLE_BIN_FILE_TYPE,
            VERSION,
            self._item_count,
            self._item_size,
            self._meta_offset,
            self._meta_size,
            self._data_offset,
            self._data_size,
            self._starttime,
            self._endtime
        )

        # meta section starts right after the header
        self._meta_offset = len(header_bytes)

        self._output_fp.seek(0, 0) # seek the output file beginning
        self._output_fp.write(header_bytes)
        self._output_fp.seek(0, 2) # seek to the file end, ready for appends

    def _write_meta(self):
        """Serialize the meta and append it; records meta size and data offset for the header"""
        meta_bytes = self._meta.to_bytes()

        # update header info: data area starts right after the meta section
        self._data_offset = self._meta_offset + len(meta_bytes)
        self._meta_size = len(meta_bytes)

        self._output_fp.write(meta_bytes)

    def _write_items(self, reader: DictReader):
        """Pack each csv row into a fixed-size item and append it to the binary"""
        columns = self._meta.columns # columns need to convert
        values = [0] * len(columns.keys()) # values buffer from each row, used to pack into binary
        buffer = memoryview(bytearray(self._meta.item_size)) # reusable item binary buffer

        field_type_dict = self._meta.items() # field -> data type

        has_invalid_column = False # some column's value may cannot be parse, will skip it

        for row in reader:
            field_index = 0 # index into values, advanced per meta field
            has_invalid_column = False

            # clear the values so missing columns default to 0
            for j in range(len(values)):
                values[j] = 0

            # read from current row
            for field, dtype in field_type_dict.items():
                column_name = columns[field]

                # NOTE: we allow field not exist in csv file, the value will be zero
                if column_name in row:
                    val = convert_val(row[column_name], dtype, self._meta.time_zone)
                    values[field_index] = val

                    # unparseable value: skip the whole row
                    if val is None:
                        has_invalid_column = True
                        break

                    # keep the start and end tick
                    if field == "timestamp":
                        # first timestamp seen becomes the start time, unless a
                        # utc_start_timestamp was supplied in the constructor;
                        # afterwards only a smaller value can lower it
                        if not self._is_starttime_changed:
                            self._is_starttime_changed = True
                            self._starttime = val
                        else:
                            self._starttime = min(self._starttime, val)

                        self._endtime =max(val, self._endtime)

                field_index += 1

            if not has_invalid_column:
                # convert item into bytes buffer, and write to file
                self._meta.item_to_bytes(values, buffer)
                self._output_fp.write(buffer)

                # update header fields for final update
                self._item_count += 1
                self._data_size += self._item_size