Python infer_compression Examples, pyqstrat.pq_utils.infer_compression Python Examples

Example #1

0

Show file

File: marketdata_processor.py Project: zhangjielun1994/pyqstrat

def text_file_record_generator_creator(filename: str,
                                       compression: str = None
                                       ) -> RecordGeneratorType:
    '''
    A helper function that opens the input file and returns a file object that we can use to iterate through
    its lines.

    Uncompressed files are opened in text mode ('r').  The gzip, bz2, zip and xz branches use the standard
    library readers in their binary read mode, so callers should be prepared to decode the lines they get back
    from compressed inputs.

    Args:
        filename: The input filename
        compression: The compression type of the input file: 'gzip', 'bz2', 'zip' or 'xz'.  Use an empty string
            if it is not compressed, or None to infer the type from the filename with infer_compression.

    Returns:
        An open file object for the contents of the file.  For a ZIP archive this is the last non-directory
        entry in the archive's listing.

    Raises:
        ValueError: If the compression type is not recognized, or if a ZIP archive contains no files.
    '''
    if compression is None:
        compression = infer_compression(filename)
    if compression is None or compression == '':
        return open(filename, 'r')
    if compression == 'gzip':
        import gzip
        return gzip.open(filename, 'r')
    if compression == 'bz2':
        import bz2
        return bz2.BZ2File(filename, 'r')
    if compression == 'zip':
        import zipfile
        zf = zipfile.ZipFile(filename,
                             mode='r',
                             compression=zipfile.ZIP_DEFLATED)
        try:
            zip_names = [zi.filename for zi in zf.infolist() if not zi.is_dir()]
            if not zip_names:
                raise ValueError(f'zero files found in ZIP file {filename}')
            return zf.open(zip_names[-1])
        except Exception:
            # No member stream is handed back on this path, so nothing else will ever release the archive's
            # file handle; close it here before propagating the error.
            zf.close()
            raise
    if compression == 'xz':
        import lzma
        return lzma.LZMAFile(filename, 'r')
    raise ValueError(
        f'unrecognized compression: {compression} for file: {filename}')

Example #2

0

Show file

File: marketdata_processor.py Project: zhangjielun1994/pyqstrat

    def __call__(self) -> Sequence[Tuple[str, str]]:
        '''
        Expands the glob pattern in self.path, applies the optional include / exclude substring filters, and
        pairs each remaining filename with the compression inferred for it.

        Returns:
            A list of (filename, compression) tuples.  The compression is an empty string when
            infer_compression returns None for that filename.

        Raises:
            Exception: If the glob pattern matches no files, or if no files are left after filtering.
        '''
        candidates = glob.glob(self.path)
        if len(candidates) == 0:
            raise Exception(
                f'no matching files found with pattern: {self.path}')

        # Both filters are plain substring tests against the full path returned by glob
        if self.include_pattern:
            candidates = [name for name in candidates if self.include_pattern in name]
        if self.exclude_pattern:
            candidates = [name for name in candidates if self.exclude_pattern not in name]

        if len(candidates) == 0:
            raise Exception(
                f'no matching files for: {self.path} including: {self.include_pattern} excluding: {self.exclude_pattern}'
            )

        # Infer the compression of every file first, then replace None with '' so each pair holds two strings
        inferred = [infer_compression(name) for name in candidates]
        compression: Sequence[str] = ['' if kind is None else kind for kind in inferred]

        return list(zip(candidates, compression))

Example #3

0

Show file

File: marketdata_processor.py Project: zhangjielun1994/pyqstrat

def process_marketdata_file(
        input_filename: str,
        compression: str,
        output_file_prefix_mapper: OutputFilePrefixMapperType,
        record_parser_creator: RecordParserCreatorType,
        aggregator_creator: AggregatorCreatorType,
        line_filter: LineFilterType = None,
        base_date_mapper: BaseDateMapperType = None,
        file_processor_creator: FileProcessorCreatorType = create_text_file_processor,
        header_parser_creator: HeaderParserCreatorType = lambda record_generator_creator: TextHeaderParser(record_generator_creator),
        header_record_generator: RecordGeneratorCreatorType = text_file_record_generator_creator,
        record_generator: RecordGeneratorType = TextFileDecompressor(),
        bad_line_handler: BadLineHandlerType = PrintBadLineHandler(),
        record_filter: RecordFilterType = None,
        missing_data_handler: MissingDataHandlerType = PriceQtyMissingDataHandler(),
        writer_creator: WriterCreatorType = None) -> None:
    '''
    Processes a single market data file: parses its headers, builds the record parser, aggregators and file
    processor, runs the file processor over the file, and finally touches "<output file prefix>.done".

    Args:
        input_filename: File name (including path) to process
        compression: Compression type for the input file.  If this is an empty string, we try to infer the
            compression type from the filename, falling back to an empty string if nothing can be inferred.
        output_file_prefix_mapper: A function that takes an input filename and returns the corresponding output file prefix we want
        record_parser_creator: A function that takes a base date and a list of column names and returns a
            function that can take a list of fields and return a subclass of Record
        aggregator_creator: A function that takes a writer creator and returns a list of Aggregators
        line_filter: A function that takes a line and decides whether we want to keep it or discard it.  Defaults to None
        base_date_mapper: A function that takes an input filename and returns the date implied by the filename,
            represented as millis since epoch.  If None, a base date of 0 is used.
        file_processor_creator: A function that returns an object that we can call with a filename and compression to
            process the lines in a file.  Defaults to helper function :obj:`create_text_file_processor`
        header_parser_creator: A function that takes header_record_generator and returns a header parser, which is then
            called with the filename and compression to get the column headers.  Defaults to creating a :obj:`TextHeaderParser`
        header_record_generator: A function that takes a filename and compression and returns a generator that we can use to get column headers
        record_generator: A function that takes a filename and compression and returns a generator that we
            can use to iterate through lines in the file
        bad_line_handler (optional): A function that takes a line that we could not parse, and either parses it or does something else
            like recording debugging info, or stopping the processing by raising an exception.  Defaults to an instance of
            :obj:`PrintBadLineHandler`
        record_filter (optional): A function that takes a parsed TradeRecord, QuoteRecord, OpenInterestRecord or OtherRecord and decides whether we
            want to keep it or discard it.  Defaults to None
        missing_data_handler (optional): A function that takes a parsed TradeRecord, QuoteRecord, OpenInterestRecord or OtherRecord, and
            deals with any data that is missing in those records.  For example, 0 for bid could be replaced by NAN.  Defaults to an instance of
            :obj:`PriceQtyMissingDataHandler`
        writer_creator (optional): A function that takes an output_file_prefix, schema, whether to create a batch id file, and batch_size
            and returns a subclass of :obj:`Writer`.  Defaults to an :obj:`HDF5WriterCreator` built from the output file prefix
    '''

    # Build the default writer creator lazily, since it depends on this file's output prefix
    if writer_creator is None:
        writer_creator = HDF5WriterCreator(
            output_file_prefix_mapper(input_filename), ' ')

    output_file_prefix = output_file_prefix_mapper(input_filename)

    base_date = base_date_mapper(input_filename) if base_date_mapper is not None else 0

    header_parser = header_parser_creator(header_record_generator)
    _logger.info(f'starting file: {input_filename}')

    if compression == "":
        # In C++ don't want virtual function with default argument, so don't allow it here
        inferred = infer_compression(input_filename)
        compression = inferred if inferred is not None else ''

    headers = header_parser(
        input_filename,
        compression)  # type: ignore # cannot be None at this point.

    record_parser = record_parser_creator(base_date, headers)
    aggregators = aggregator_creator(writer_creator)
    file_processor = file_processor_creator(
        record_generator,
        line_filter,
        record_parser,
        bad_line_handler,
        record_filter,
        missing_data_handler,
        aggregators)

    # Time only the actual processing of the file, reported in whole milliseconds
    started = timer()
    lines_processed = file_processor(input_filename, compression)
    finished = timer()
    duration = round((finished - started) * 1000)

    # Marker file written once processing has returned
    touch(output_file_prefix + '.done')
    _logger.info(
        f"processed: {input_filename} {lines_processed} lines in {duration} milliseconds"
    )