Example #1
0
 def write_summary(self):
     """
     Finish filling in the parse summary info and close up.

     Hands the data start/stop timestamps and the processed line count
     (self.__line_number + 1) to the parse manager, reports zero
     irrelevant records and passes True to mark_stop(). The h5 file is
     then closed and ParseManager.summarize_file() is run on the output
     path.

     The h5 file is closed even if one of the parse manager calls raises,
     so a failure here does not leave the file handle open; the exception
     still propagates and no summary is produced in that case.
     """
     manager = self.__parse_manager
     try:
         manager.data_start(self.__data_start_timestamp)
         manager.data_stop(self.__current_timestamp)
         manager.irrelevants(0)
         # line numbers are zero based, hence the +1 for a count
         manager.processed(self.__line_number+1)
         manager.mark_stop(True)
     finally:
         self.__h5_file.close()
     ParseManager.summarize_file(self.__output_path)
Example #2
0
    def parse(self, build_book = True, force = False, stop_early_at_hit=0):
        """
        Parse the input file. There are two modes: build_book=True and
        build_book=False. If build_book=False, the h5 file is simply the same
        record data from the gz file, but stored as hdf5 (a 'records' table
        in the '_AMD_Data_' group of <output_base>_AMD_.h5). If
        build_book=True, every matching record is handed to
        self.build_books() and the output goes to <output_base>.h5. Each
        symbol gets its own dataset.

        Only 'A', 'D' and 'M' lines are turned into records; all other lines
        are skipped. If self.__symbols is non-empty, records whose symbol is
        not in it are skipped too.

        The ParseManager is used to store summary information for the parse of
        this data.

        Arguments:
        build_book -- select the mode described above
        force -- parse even if the output file already exists; otherwise an
                 existing output file makes this method return immediately
        stop_early_at_hit -- when non-zero, stop reading as soon as this many
                             matching records have been handled
        """
        self.__output_path = self.__output_base + (".h5" if build_book else "_AMD_.h5")
        logging.info("Parsing file %s\n\tto create %s",
                     self.__input_path, self.__output_path)
        if self.__output_path.exists() and not force:
            return
        if not self.__output_path.parent.exists():
            os.makedirs(self.__output_path.parent)
        self.__h5_file = openFile(self.__output_path, mode = "w", title = "ARCA Equity Data")
        try:
            table = None
            h5Record = None
            if not build_book:
                ## If not building book, then just writing out AMD data as hdf5
                filters = Filters(complevel=1, complib='zlib')
                group = self.__h5_file.createGroup("/", '_AMD_Data_', 'Add-Modify-Delete data')
                table = self.__h5_file.createTable(group, 'records', ArcaRecord,
                                                   "Data for "+str(self.__date), filters=filters)
                h5Record = table.row

            self.__parse_manager = ParseManager(self.__input_path, self.__h5_file)
            self.__parse_manager.mark_start()

            # Record codes that are parsed; 'I', 'V' and any unexpected code
            # are silently skipped.
            record_types = {'A': AddRecord, 'D': DeleteRecord, 'M': ModifyRecord}
            split_fields = re.compile(r'\s*,\s*').split

            hit_count = 0
            data_start_timestamp = None
            # Last A/D/M record read (whether or not its symbol matched); it
            # supplies the data_stop timestamp once the loop is done.
            last_record = None
            # Keeps processed() at 0 lines when the input turns out empty.
            self.__line_number = -1

            gz_file = gzip.open(self.__input_path, 'rb')
            try:
                for self.__line_number, line in enumerate(gz_file):

                    if stop_early_at_hit and hit_count == stop_early_at_hit:
                        break

                    ###################################################
                    # Show progress periodically
                    ###################################################
                    if 0 == (self.__line_number % 1000000):
                        logging.info("At %d hit count is %d on %s",
                                     self.__line_number, hit_count,
                                     self.__symbols or "*")

                    fields = split_fields(line)
                    code = fields[0]
                    record_type = record_types.get(code)
                    if record_type is None:
                        continue
                    record = record_type(fields, self.__start_of_date)
                    last_record = record

                    if self.__symbols and record.symbol not in self.__symbols:
                        continue
                    hit_count += 1

                    # record the timestamp of the first record as data_start
                    if data_start_timestamp is None:
                        data_start_timestamp = record.timestamp

                    if build_book:
                        self.build_books(record)
                        continue

                    h5Record['ts'] = record.timestamp
                    h5Record['asc_ts'] = chicago_time_str(record.timestamp)
                    h5Record['symbol'] = record.symbol
                    h5Record['seq_num'] = record.seq_num
                    h5Record['order_id'] = record.order_id
                    h5Record['record_type'] = code
                    h5Record['buy_sell'] = 'B' if record.is_buy else 'S'
                    # delete records carry no price/quantity
                    if code != 'D':
                        h5Record['price'] = record.price
                        h5Record['quantity'] = record.quantity

                    h5Record.append()

                    if 0 == hit_count % __FLUSH_FREQ__:
                        table.flush()
            finally:
                gz_file.close()

            if table is not None:
                # push out rows appended since the last periodic flush
                table.flush()

            books_good = True
            total_unchanged = 0
            for _symbol, builder in self.__book_builders.iteritems():
                books_good = books_good and builder.summary()
                total_unchanged += builder.unchanged

            ############################################################
            # Finish filling in the parse summary info and close up
            ############################################################
            data_stop_timestamp = None
            if last_record is not None:
                data_stop_timestamp = last_record.timestamp
            self.__parse_manager.data_start(data_start_timestamp)
            self.__parse_manager.data_stop(data_stop_timestamp)
            self.__parse_manager.irrelevants(total_unchanged)
            self.__parse_manager.processed(self.__line_number+1)
            self.__parse_manager.mark_stop(books_good)
        finally:
            # close the h5 file on failure as well as on success
            self.__h5_file.close()
        ParseManager.summarize_file(self.__output_path)
Example #3
0
    def parse(self, build_book = True, force = False, stop_early_at_hit=0):
        """
        Parse the input file. There are two modes: build_book=True and
        build_book=False. If build_book=False, the h5 file is simply the same
        record data from the gz file, but stored as hdf5 (a 'records' table
        in the '_AMD_Data_' group of <output_base>_AMD_.h5). If
        build_book=True, every matching record is handed to
        self.build_books() and the output goes to <output_base>.h5. Each
        symbol gets its own dataset.

        The input is decompressed by an external `gzip -d -c` process. When
        self.__symbols is non-empty its output is piped through `sed` so that
        only lines mentioning one of the symbols reach Python; this is a
        coarse pre-filter only, the exact symbol test is still done on each
        parsed record. Only 'A', 'D' and 'M' lines are turned into records.

        The ParseManager is used to store summary information for the parse of
        this data.

        Arguments:
        build_book -- select the mode described above
        force -- parse even if the output file already exists; otherwise an
                 error message is printed and the method returns
        stop_early_at_hit -- when non-zero, stop reading as soon as this many
                             matching records have been handled
        """
        self.__output_path = self.__output_base + (".h5" if build_book else "_AMD_.h5")
        logging.info("Parsing file %s\n\tto create %s",
                     self.__input_path, self.__output_path)
        if self.__output_path.exists() and not force:
            print('Error: output file already exists. Use --force to overwrite.')
            return
        if not self.__output_path.parent.exists():
            os.makedirs(self.__output_path.parent)
        self.__h5_file = openFile(bytes(self.__output_path), mode = "w", title = "ARCA Equity Data")
        try:
            table = None
            h5Record = None
            if not build_book:
                ## If not building book, then just writing out AMD data as hdf5
                filters = Filters(complevel=1, complib='zlib')
                group = self.__h5_file.createGroup("/", '_AMD_Data_', 'Add-Modify-Delete data')
                table = self.__h5_file.createTable(group, 'records', ArcaRecord,
                                                   "Data for "+str(self.__date), filters=filters)
                h5Record = table.row

            self.__parse_manager = ParseManager(self.__input_path, self.__h5_file)
            self.__parse_manager.mark_start()

            # Record codes that are parsed; 'I', 'V' and any unexpected code
            # are silently skipped.
            record_types = {'A': AddRecord, 'D': DeleteRecord, 'M': ModifyRecord}

            hit_count = 0
            data_start_timestamp = None
            # Last A/D/M record read (whether or not its symbol matched); it
            # supplies the data_stop timestamp once the loop is done.
            last_record = None
            # Keeps processed() at 0 lines when no line is read at all.
            self.__line_number = -1

            unzip = subprocess.Popen(['gzip', '-d', '-c', self.__input_path],
                                     stdout=subprocess.PIPE,
                                     bufsize=-1)
            children = [unzip]
            source = unzip.stdout
            try:
                if self.__symbols:
                    # One-character symbols are wrapped in commas so that
                    # they only match as a complete comma-delimited field.
                    symbols_cleaned = [s if len(s) > 1 else ','+s+','
                                       for s in self.__symbols]
                    symbol_regex = r'/\(' + r'\|'.join(symbols_cleaned) + r'\)/p'
                    sed = subprocess.Popen(['sed', '-n', symbol_regex],
                                           stdin=unzip.stdout,
                                           stdout=subprocess.PIPE,
                                           bufsize=-1)
                    # sed holds its own copy of the pipe; dropping ours lets
                    # gzip notice a broken pipe if sed exits first.
                    unzip.stdout.close()
                    children.append(sed)
                    source = sed.stdout

                print("input path: {}".format(self.__input_path))

                infile = csv.reader(iter(source.readline, ''))

                for self.__line_number, fields in enumerate(infile):
                    if stop_early_at_hit and hit_count == stop_early_at_hit:
                        break

                    ###################################################
                    # Show progress periodically
                    ###################################################
                    if 0 == (self.__line_number % 1000000):
                        logging.info("At %d hit count is %d on %s",
                                     self.__line_number, hit_count,
                                     self.__symbols or "*")

                    if not fields:
                        # csv yields an empty list for a blank line
                        continue
                    code = fields[0]
                    record_type = record_types.get(code)
                    if record_type is None:
                        continue
                    record = record_type(fields, self.__start_of_date)
                    last_record = record

                    if self.__symbols and record.symbol not in self.__symbols:
                        continue
                    hit_count += 1

                    # record the timestamp of the first record as data_start
                    if data_start_timestamp is None:
                        data_start_timestamp = record.timestamp

                    if build_book:
                        self.build_books(record)
                        continue

                    h5Record['ts'] = record.timestamp
                    h5Record['asc_ts'] = chicago_time_str(record.timestamp)
                    h5Record['symbol'] = record.symbol
                    h5Record['seq_num'] = record.seq_num
                    h5Record['order_id'] = record.order_id
                    h5Record['record_type'] = code
                    h5Record['buy_sell'] = 'B' if record.is_buy else 'S'
                    # delete records carry no price/quantity
                    if code != 'D':
                        h5Record['price'] = record.price
                        h5Record['quantity'] = record.quantity

                    h5Record.append()

                    if 0 == hit_count % __FLUSH_FREQ__:
                        table.flush()
            finally:
                # Close our end of the pipe BEFORE waiting: after an early
                # stop the children may be blocked writing to a full pipe
                # and would otherwise never exit.
                source.close()
                for child in children:
                    child.wait()

            if table is not None:
                # push out rows appended since the last periodic flush
                table.flush()

            books_good = True
            total_unchanged = 0
            for _symbol, builder in self.__book_builders.iteritems():
                books_good = books_good and builder.summary()
                total_unchanged += builder.unchanged

            ############################################################
            # Finish filling in the parse summary info and close up
            ############################################################
            data_stop_timestamp = None
            if last_record is not None:
                data_stop_timestamp = last_record.timestamp
            self.__parse_manager.data_start(data_start_timestamp)
            self.__parse_manager.data_stop(data_stop_timestamp)
            self.__parse_manager.irrelevants(total_unchanged)
            self.__parse_manager.processed(self.__line_number+1)
            self.__parse_manager.mark_stop(books_good)
        finally:
            # close the h5 file on failure as well as on success
            self.__h5_file.close()
        ParseManager.summarize_file(bytes(self.__output_path))