def write_summary(self):
    """
    Finish filling in the parse summary info and close up.

    Records the data start/stop timestamps, an irrelevants count of zero
    and the processed line count (last line number + 1) on the parse
    manager, marks the parse as stopped with a True status, closes the
    h5 file and then asks ParseManager to summarize the output file.
    """
    summary = self.__parse_manager

    # Time span covered by the data that was read
    summary.data_start(self.__data_start_timestamp)
    summary.data_stop(self.__current_timestamp)

    # Counters: line numbers are zero-based, hence the +1
    summary.irrelevants(0)
    summary.processed(self.__line_number + 1)
    summary.mark_stop(True)

    # The summary is produced from the path only after the file is closed
    self.__h5_file.close()
    ParseManager.summarize_file(self.__output_path)
def parse(self, build_book = True, force = False, stop_early_at_hit=0):
    """
    Parse the gzipped input file into an hdf5 file.

    There are two modes: build_book=True and build_book=False.

    If build_book=False, the h5 file is simply the same record data from
    the gz file, but stored as hdf5.

    If build_book=True, every matching record is handed to
    self.build_books() so the hdf5 file created has book data for all
    matching inputs. Each symbol gets its own dataset.

    The ParseManager is used to store summary information for the parse
    of this data.

    Parameters:
      build_book        - selects the mode described above and the output
                          suffix (".h5" versus "_AMD_.h5").
      force             - when False and the output file already exists,
                          return without doing anything.
      stop_early_at_hit - when non-zero, stop reading once this many
                          matching records have been handled.
    """
    suffix = ".h5" if build_book else "_AMD_.h5"
    self.__output_path = self.__output_base + suffix
    logging.info("Parsing file %s\n\tto create %s",
                 self.__input_path, self.__output_path)

    if self.__output_path.exists() and not force:
        return

    if not self.__output_path.parent.exists():
        os.makedirs(self.__output_path.parent)

    self.__h5_file = openFile(self.__output_path, mode = "w",
                              title = "ARCA Equity Data")
    try:
        if not build_book:
            ## If not building book, then just writing out AMD data as hdf5
            filters = Filters(complevel=1, complib='zlib')
            group = self.__h5_file.createGroup("/", '_AMD_Data_',
                                               'Add-Modify-Delete data')
            table = self.__h5_file.createTable(group, 'records', ArcaRecord,
                                               "Data for "+str(self.__date),
                                               filters=filters)
            h5Record = table.row

        self.__parse_manager = ParseManager(self.__input_path, self.__h5_file)
        self.__parse_manager.mark_start()

        # Record codes that produce a record; every other code (including
        # 'I' and 'V') is skipped.
        record_types = {'A': AddRecord, 'D': DeleteRecord, 'M': ModifyRecord}
        split_fields = re.compile(r'\s*,\s*').split

        hit_count = 0
        data_start_timestamp = None
        # Timestamp of the last Add/Modify/Delete record read, whether or
        # not its symbol matched. Tracked separately because the loop
        # variable may be a skipped line (no record) when the loop ends.
        last_timestamp = None
        # Keeps processed() meaningful (0 lines) when the input is empty.
        self.__line_number = -1

        gz_file = gzip.open(self.__input_path, 'rb')
        try:
            for self.__line_number, line in enumerate(gz_file):
                if stop_early_at_hit and hit_count == stop_early_at_hit:
                    break

                ###################################################
                # Show progress periodically
                ###################################################
                if 0 == (self.__line_number % 1000000):
                    logging.info("At %d hit count is %d on %s",
                                 self.__line_number, hit_count,
                                 self.__symbols or "*")

                fields = split_fields(line)
                code = fields[0]
                record_type = record_types.get(code)
                if record_type is None:
                    continue

                record = record_type(fields, self.__start_of_date)
                last_timestamp = record.timestamp

                if self.__symbols and record.symbol not in self.__symbols:
                    continue
                hit_count += 1

                # record the timestamp of the first record as data_start
                if data_start_timestamp is None:
                    data_start_timestamp = record.timestamp

                if build_book:
                    self.build_books(record)
                else:
                    h5Record['ts'] = record.timestamp
                    h5Record['asc_ts'] = chicago_time_str(record.timestamp)
                    h5Record['symbol'] = record.symbol
                    h5Record['seq_num'] = record.seq_num
                    h5Record['order_id'] = record.order_id
                    h5Record['record_type'] = code
                    h5Record['buy_sell'] = 'B' if record.is_buy else 'S'
                    # NOTE(review): price and quantity are both assumed to be
                    # skipped for delete records -- confirm against the
                    # original indentation.
                    if code != 'D':
                        h5Record['price'] = record.price
                        h5Record['quantity'] = record.quantity
                    h5Record.append()
                    if 0 == hit_count % __FLUSH_FREQ__:
                        table.flush()
        finally:
            gz_file.close()

        books_good = True
        total_unchanged = 0
        for builder in self.__book_builders.values():
            # NOTE(review): `and` short-circuits, so summary() is not called
            # on later builders once one has reported failure -- kept as is.
            books_good = books_good and builder.summary()
            total_unchanged += builder.unchanged

        ############################################################
        # Finish filling in the parse summary info and close up
        ############################################################
        self.__parse_manager.data_start(data_start_timestamp)
        # last_timestamp is None when no A/M/D record was read at all.
        self.__parse_manager.data_stop(last_timestamp)
        self.__parse_manager.irrelevants(total_unchanged)
        self.__parse_manager.processed(self.__line_number+1)
        self.__parse_manager.mark_stop(books_good)
    finally:
        # Close the hdf5 file even when parsing fails part way through.
        self.__h5_file.close()

    ParseManager.summarize_file(self.__output_path)
def parse(self, build_book = True, force = False, stop_early_at_hit=0):
    """
    Parse the gzipped input file into an hdf5 file.

    There are two modes: build_book=True and build_book=False.

    If build_book=False, the h5 file is simply the same record data from
    the gz file, but stored as hdf5.

    If build_book=True, every matching record is handed to
    self.build_books() so the hdf5 file created has book data for all
    matching inputs. Each symbol gets its own dataset.

    The input is decompressed by an external `gzip -d -c` process. When
    symbols are configured, its output is pre-filtered by an external
    `sed` process so that only lines mentioning one of the symbols reach
    Python; the exact symbol test is still done here on each record.

    The ParseManager is used to store summary information for the parse
    of this data.

    Parameters:
      build_book        - selects the mode described above and the output
                          suffix (".h5" versus "_AMD_.h5").
      force             - when False and the output file already exists,
                          print an error and return.
      stop_early_at_hit - when non-zero, stop reading once this many
                          matching records have been handled.
    """
    suffix = ".h5" if build_book else "_AMD_.h5"
    self.__output_path = self.__output_base + suffix
    logging.info("Parsing file %s\n\tto create %s",
                 self.__input_path, self.__output_path)

    if self.__output_path.exists() and not force:
        print('Error: output file already exists. Use --force to overwrite.')
        return

    if not self.__output_path.parent.exists():
        os.makedirs(self.__output_path.parent)

    self.__h5_file = openFile(bytes(self.__output_path), mode = "w",
                              title = "ARCA Equity Data")
    try:
        if not build_book:
            ## If not building book, then just writing out AMD data as hdf5
            filters = Filters(complevel=1, complib='zlib')
            group = self.__h5_file.createGroup("/", '_AMD_Data_',
                                               'Add-Modify-Delete data')
            table = self.__h5_file.createTable(group, 'records', ArcaRecord,
                                               "Data for "+str(self.__date),
                                               filters=filters)
            h5Record = table.row

        self.__parse_manager = ParseManager(self.__input_path, self.__h5_file)
        self.__parse_manager.mark_start()

        # Record codes that produce a record; every other code (including
        # 'I' and 'V') is skipped.
        record_types = {'A': AddRecord, 'D': DeleteRecord, 'M': ModifyRecord}

        hit_count = 0
        data_start_timestamp = None
        # Timestamp of the last Add/Modify/Delete record read, whether or
        # not its symbol matched. Tracked separately because the loop
        # variable may be a skipped line (no record) when the loop ends.
        last_timestamp = None
        # Keeps processed() meaningful (0 lines) when nothing is read.
        self.__line_number = -1

        unzip = subprocess.Popen(['gzip', '-d', '-c', self.__input_path],
                                 stdout=subprocess.PIPE, bufsize=-1)
        children = [unzip]
        stream = unzip.stdout
        try:
            if self.__symbols:
                # Characters with a special meaning inside the sed /.../
                # basic regular expression; escaped so a symbol is always
                # matched literally and cannot break the sed script.
                bre_special = '\\.*[^$/'
                patterns = []
                for symbol in self.__symbols:
                    escaped = ''.join('\\' + ch if ch in bre_special else ch
                                      for ch in symbol)
                    # Short symbols are wrapped in commas so they only match
                    # a whole field rather than any line holding that letter.
                    if len(symbol) > 1:
                        patterns.append(escaped)
                    else:
                        patterns.append(',' + escaped + ',')
                symbol_regex = r'/\(' + r'\|'.join(patterns) + r'\)/p'
                sed = subprocess.Popen(['sed', '-n', symbol_regex],
                                       stdin=unzip.stdout,
                                       stdout=subprocess.PIPE, bufsize=-1)
                # Drop our copy of the pipe so gzip notices when sed exits.
                unzip.stdout.close()
                children.append(sed)
                stream = sed.stdout

            print("input path: {}".format(self.__input_path))

            infile = csv.reader(iter(stream.readline, ''))
            for self.__line_number, fields in enumerate(infile):
                if stop_early_at_hit and hit_count == stop_early_at_hit:
                    break

                ###################################################
                # Show progress periodically
                ###################################################
                if 0 == (self.__line_number % 1000000):
                    logging.info("At %d hit count is %d on %s",
                                 self.__line_number, hit_count,
                                 self.__symbols or "*")

                # csv.reader yields an empty list for a blank line
                if not fields:
                    continue

                code = fields[0]
                record_type = record_types.get(code)
                if record_type is None:
                    continue

                record = record_type(fields, self.__start_of_date)
                last_timestamp = record.timestamp

                if self.__symbols and record.symbol not in self.__symbols:
                    continue
                hit_count += 1

                # record the timestamp of the first record as data_start
                if data_start_timestamp is None:
                    data_start_timestamp = record.timestamp

                if build_book:
                    self.build_books(record)
                else:
                    h5Record['ts'] = record.timestamp
                    h5Record['asc_ts'] = chicago_time_str(record.timestamp)
                    h5Record['symbol'] = record.symbol
                    h5Record['seq_num'] = record.seq_num
                    h5Record['order_id'] = record.order_id
                    h5Record['record_type'] = code
                    h5Record['buy_sell'] = 'B' if record.is_buy else 'S'
                    # NOTE(review): price and quantity are both assumed to be
                    # skipped for delete records -- confirm against the
                    # original indentation.
                    if code != 'D':
                        h5Record['price'] = record.price
                        h5Record['quantity'] = record.quantity
                    h5Record.append()
                    if 0 == hit_count % __FLUSH_FREQ__:
                        table.flush()
        finally:
            # Close the read end first: after an early break the children
            # may be blocked writing to a full pipe, and waiting on them
            # without closing it would hang forever.
            stream.close()
            for child in reversed(children):
                child.wait()

        books_good = True
        total_unchanged = 0
        for builder in self.__book_builders.values():
            # NOTE(review): `and` short-circuits, so summary() is not called
            # on later builders once one has reported failure -- kept as is.
            books_good = books_good and builder.summary()
            total_unchanged += builder.unchanged

        ############################################################
        # Finish filling in the parse summary info and close up
        ############################################################
        self.__parse_manager.data_start(data_start_timestamp)
        # last_timestamp is None when no A/M/D record was read at all.
        self.__parse_manager.data_stop(last_timestamp)
        self.__parse_manager.irrelevants(total_unchanged)
        self.__parse_manager.processed(self.__line_number+1)
        self.__parse_manager.mark_stop(books_good)
    finally:
        # Close the hdf5 file even when parsing fails part way through.
        self.__h5_file.close()

    ParseManager.summarize_file(bytes(self.__output_path))