def text_file_record_generator_creator(filename: str, compression: str = None) -> RecordGeneratorType:
    '''
    A helper function that returns a generator that we can use to iterate through lines in the input file

    Args:
        filename: The input filename
        compression: The compression type of the input file: 'gzip', 'bz2', 'zip' or 'xz'.
            Pass None to infer the type from the filename, or '' if the file is not compressed.

    Returns:
        An open, iterable file object.  An uncompressed file is opened in text mode.  The gzip, bz2,
        zip and xz handles are opened with mode 'r', which is binary for those modules, so they yield bytes.
        For a zip archive, the handle is for the last non-directory member in the archive listing.

    Raises:
        ValueError: If a zip archive contains no files, or the compression type is not recognized.
    '''
    if compression is None:
        compression = infer_compression(filename)

    if compression is None or compression == '':
        return open(filename, 'r')

    if compression == 'gzip':
        import gzip
        return gzip.open(filename, 'r')

    if compression == 'bz2':
        import bz2
        return bz2.BZ2File(filename, 'r')

    if compression == 'zip':
        import zipfile
        # The compression argument of ZipFile only matters when writing, so it is not passed here.
        zf = zipfile.ZipFile(filename, mode='r')
        member_names = [zi.filename for zi in zf.infolist() if not zi.is_dir()]
        if not member_names:
            # Nothing to hand back to the caller, so release the archive before failing.
            zf.close()
            raise ValueError(f'zero files found in ZIP file {filename}')
        # The returned member handle shares the archive's underlying file, so zf is left open.
        return zf.open(member_names[-1])

    if compression == 'xz':
        import lzma
        return lzma.LZMAFile(filename, 'r')

    raise ValueError(
        f'unrecognized compression: {compression} for file: {filename}')
def __call__(self) -> Sequence[Tuple[str, str]]:
    '''
    Expand the glob pattern, apply the include / exclude filters and pair each remaining
    filename with its compression type

    Returns:
        A list of (filename, compression) tuples; compression is '' when none could be inferred

    Raises:
        Exception: If nothing matches the pattern, or nothing is left after filtering
    '''
    candidates = glob.glob(self.path)
    if len(candidates) == 0:
        raise Exception(
            f'no matching files found with pattern: {self.path}')

    # Both patterns are plain substring tests against the full filename.
    selected = []
    for name in candidates:
        if self.include_pattern and self.include_pattern not in name:
            continue
        if self.exclude_pattern and self.exclude_pattern in name:
            continue
        selected.append(name)

    if len(selected) == 0:
        raise Exception(
            f'no matching files for: {self.path} including: {self.include_pattern} excluding: {self.exclude_pattern}'
        )

    pairs = []
    for name in selected:
        inferred = infer_compression(name)
        # Callers get an empty string rather than None for files with no recognized compression.
        pairs.append((name, '' if inferred is None else inferred))
    return pairs
def process_marketdata_file(
        input_filename: str,
        compression: str,
        output_file_prefix_mapper: OutputFilePrefixMapperType,
        record_parser_creator: RecordParserCreatorType,
        aggregator_creator: AggregatorCreatorType,
        line_filter: LineFilterType = None,
        base_date_mapper: BaseDateMapperType = None,
        file_processor_creator: FileProcessorCreatorType = create_text_file_processor,
        header_parser_creator: HeaderParserCreatorType = lambda record_generator_creator: TextHeaderParser(record_generator_creator),
        header_record_generator: RecordGeneratorCreatorType = text_file_record_generator_creator,
        record_generator: RecordGeneratorType = TextFileDecompressor(),
        bad_line_handler: BadLineHandlerType = PrintBadLineHandler(),
        record_filter: RecordFilterType = None,
        missing_data_handler: MissingDataHandlerType = PriceQtyMissingDataHandler(),
        writer_creator: WriterCreatorType = None) -> None:
    '''
    Processes a single market data file

    Args:
        input_filename: File name (including path) to process
        compression: Compression type for the input file.  If None or an empty string, we try to infer the
            compression type from the filename, falling back to '' when it cannot be inferred.
        output_file_prefix_mapper: A function that takes an input filename and returns the corresponding
            output file prefix we want
        record_parser_creator: A function that takes a date and a list of column names and returns a
            function that can take a list of fields and return a subclass of Record
        aggregator_creator: A function that takes a writer creator and returns a list of Aggregators
        line_filter: A function that takes a line and decides whether we want to keep it or discard it.
            Defaults to None
        base_date_mapper: A function that takes an input filename and returns the date implied by the
            filename, represented as millis since epoch.  If None, a base date of 0 is used.
        file_processor_creator: A function that returns an object that we can use to iterate through lines
            in a file.  Defaults to helper function :obj:`create_text_file_processor`
        header_parser_creator: A function that takes a record generator creator and returns a header parser.
            The header parser is called with the input filename and compression and returns the column
            headers.  Defaults to creating a :obj:`TextHeaderParser`
        header_record_generator: A function that takes a filename and compression and returns a generator
            that we can use to get column headers
        record_generator: A function that takes a filename and compression and returns a generator that
            we can use to iterate through lines in the file
        bad_line_handler (optional): A function that takes a line that we could not parse, and either parses
            it or does something else like recording debugging info, or stopping the processing by raising
            an exception.  Defaults to helper function :obj:`PrintBadLineHandler`
        record_filter (optional): A function that takes a parsed TradeRecord, QuoteRecord, OpenInterestRecord
            or OtherRecord and decides whether we want to keep it or discard it.  Defaults to None
        missing_data_handler (optional): A function that takes a parsed TradeRecord, QuoteRecord,
            OpenInterestRecord or OtherRecord, and deals with any data that is missing in those records.
            For example, 0 for bid could be replaced by NAN.  Defaults to :obj:`PriceQtyMissingDataHandler`
        writer_creator (optional): A function that takes an output_file_prefix, schema, whether to create a
            batch id file, and batch_size and returns a subclass of :obj:`Writer`.
            Defaults to an :obj:`HDF5WriterCreator` built from the output file prefix
    '''
    # Map the filename once; the prefix is used for the default writer and for the '.done' marker.
    output_file_prefix = output_file_prefix_mapper(input_filename)

    if writer_creator is None:
        writer_creator = HDF5WriterCreator(output_file_prefix, ' ')

    base_date = 0
    if base_date_mapper is not None:
        base_date = base_date_mapper(input_filename)

    header_parser = header_parser_creator(header_record_generator)

    _logger.info(f'starting file: {input_filename}')

    # Treat None the same as '' so that "not set" always means "infer from the filename".
    if compression is None or compression == '':
        compression_tmp = infer_compression(input_filename)
        compression = '' if compression_tmp is None else compression_tmp

    # In C++ don't want virtual function with default argument, so don't allow it here
    headers = header_parser(
        input_filename,
        compression)  # type: ignore # cannot be None at this point.

    record_parser = record_parser_creator(base_date, headers)
    aggregators = aggregator_creator(writer_creator)

    file_processor = file_processor_creator(record_generator, line_filter,
                                            record_parser, bad_line_handler,
                                            record_filter,
                                            missing_data_handler, aggregators)

    start = timer()
    lines_processed = file_processor(input_filename, compression)
    end = timer()
    duration = round((end - start) * 1000)

    # Marker file signalling that this input was processed to completion.
    touch(output_file_prefix + '.done')
    _logger.info(
        f"processed: {input_filename} {lines_processed} lines in {duration} milliseconds"
    )