def main(config=None):
    """Main function for controlling scoring.

    Reads the configuration file, builds the data input (a Reader or an
    HTTPInterfaceServer) from the first recognized input element in
    ``consumer.data_input_info``, initializes the model and then scores in
    batch, run-forever or read-once mode.

    Arguments:

        config (string or None; default None): Name of the configuration
            file.  When it is empty or None the name is taken from the
            -c/--config command line option (default "config.xml").

    Raises:

        ConfigurationError: if the configuration holds no data input
            information, or none of its elements yields a data input.
    """
    logging.basicConfig(level=logging.DEBUG)
    from optparse import OptionParser, make_option

    def attr_or_none(element, key):
        """Return element.attr[key], or None when the attribute is absent."""
        if key in element.attr:
            return element.attr[key]
        return None

    # Define the options.
    usage = "usage: %prog [options]"
    version = "%prog 0.3.3"
    options = [
        make_option("-c", "--config", metavar="config", default="config.xml",
                    help="The configuration file name")]
    parser = OptionParser(usage=usage, version=version, option_list=options)

    # Parse the command line only when the caller did not name a file.
    if not config:
        parsed_options, _ = parser.parse_args()
        config = parsed_options.config

    # Take in a bunch of options describing where everything is.
    consumer = pmmlConsumer()
    consumer.logger.debug("Create Reader to get Configuration")
    config_reader = Reader(consumer.configure, source=str(config),
                           magicheader=False, autoattr=False)
    consumer.logger.debug("Read Config File")
    config_reader.read_once()

    # Create any reader or http server to read in data.
    data_input = None
    run_forever = True

    # Check to make sure that we don't try to iterate over None.
    if consumer.data_input_info is None:
        raise ConfigurationError(
            "Data input source missing from configuration.")

    for item in consumer.data_input_info:
        if item.name == "readOnce":
            run_forever = False
        elif item.name == "batchScoring":
            consumer.batch_scoring = True
        elif data_input is not None:
            # Only process the first way that we are told to get the data.
            continue
        elif item.name == "fromFile" or item.name == "fromFifo":
            # No special treatment needed other than UniTable vs XML.
            isUni = attr_or_none(item, 'type') == "UniTable"
            data_input = Reader(consumer.score, source=item.attr['name'],
                                logger=consumer.logger, magicheader=False,
                                unitable=isUni)
        elif item.name == "fromFixedRecordFile":
            # Fields are laid out back to back: each one starts where the
            # previous one ended.
            ffnames = []
            ffstarts = []
            ffends = []
            start = 0
            for field in item:
                ffnames.append(field.attr['name'])
                ffstarts.append(start)
                start += int(field.attr['length'])
                ffends.append(start)
            ffCR = attr_or_none(item, 'cr')
            data_input = Reader(consumer.score, source=item.attr['name'],
                                types=None, logger=consumer.logger,
                                magicheader=False, unitable=True,
                                ffConvert=ffConfig(ffnames, ffstarts,
                                                   ffends, ffCR))
        elif item.name == "fromCSVFile":
            # We have a CSV file that needs special treatment to read in
            # correctly.
            data_input = Reader(consumer.score, source=item.attr['name'],
                                logger=consumer.logger, magicheader=False,
                                unitable=True,
                                header=attr_or_none(item, 'header'),
                                sep=attr_or_none(item, 'sep'),
                                types=attr_or_none(item, 'types'))
        elif item.name == "fromStandardInput":
            isUni = attr_or_none(item, 'type') == "UniTable"
            data_input = Reader(consumer.score, source="-",
                                logger=consumer.logger, magicheader=False,
                                unitable=isUni)
        elif item.name == "fromHTTP":
            # Get the stuff we need to set up the server.
            input_url = item.attr['url']
            input_port = int(item.attr['port'])
            if attr_or_none(item, 'type') == "UniTable":
                callback = consumer.score_http_uni
            else:
                callback = consumer.score_http_xml
            # Create the server and add the callback.
            data_input = HTTPInterfaceServer(('', input_port),
                                             logger=consumer.logger)
            data_input.register_callback(input_url, callback)
        else:
            # Not recognized.
            consumer.logger.warning(
                "Element %s is not a recognized child element of inputData, ignoring." % (item.name))

    if data_input is None:
        raise ConfigurationError("Unable to determine data input source.")

    consumer.logger.debug("Initialize model")
    # Initialize the model.  This is after the data information is input so
    # that batch scoring may be faster.
    # NOTE(review): 'initalize_model' is the spelling this code calls;
    # confirm it matches the method defined on pmmlConsumer.
    consumer.initalize_model()
    consumer.logger.warning("Ready to score")

    # Start scoring data.
    if consumer.batch_scoring:
        consumer.logger.debug("Batch Scoring")
        if isinstance(data_input, Reader):
            data_input.read_once()
            report = consumer.format_results(consumer.model.batchScore())
            if consumer.output_filename:
                out = open(consumer.output_filename, 'w')
                # Close the report file even if writing it fails.
                try:
                    consumer.output_report_header(file_handle=out)
                    out.write(report)
                    consumer.output_report_footer(file_handle=out)
                finally:
                    out.close()
    elif run_forever:
        consumer.logger.debug("Run Forever")
        if isinstance(data_input, Reader):
            consumer.output_report_header()
            data_input.read_forever()
            consumer.output_report_footer()
        elif isinstance(data_input, HTTPServer):
            data_input.serve_forever()
        else:
            print("Reading data failed.")
    else:
        # Just read once.
        consumer.logger.debug("Run Once")
        if isinstance(data_input, Reader):
            consumer.output_report_header()
            data_input.read_once()
            consumer.output_report_footer()
        elif isinstance(data_input, HTTPServer):
            data_input.handle_request()
        else:
            print("Reading data failed.")
class DataStreamer:
    """Contains a queue of data.  Its internal read function uses threading.

    Usage:

        dataStreamer = DataStreamer(config_options)
        # Get data_fields from the PMML model (assigned in this example).
        data_fields = {'field1':int, 'field2':str}

        # If reading from a source handle:
        dataStreamer.initialize()
        for record in dataStreamer:
            # process the record

        # Else if in interactive mode, add data using 'enqueue'.
        # (In a thread if the data must be buffered...)
        dataStreamer.enqueue(data)

    Public methods:

        get, next, and __iter__
        enqueue(self, dictionary)
        initialize(self)  # Can only call this once.

    Internal methods:

        _read(self)
        _unitableCallback(self, uni_record)
        _xmlCallback(self, native_element)

    Data Members:

        _runOptions(NameSpace; contains initialization arguments)
        _logger (Logger)
        _queue (Queue): Queue of unprocessed, but read, data elements.
            These should be either dictionaries or UniTables.
        _reader (Either None, a Reader, or HTTPInterfaceServer)
        _thread (Thread): Thread that will run the reader.
        _values (dictionary): The most recent row of data from _queue.
    """

    def __init__(self, fromHTTP=False, interactive=False, isXML=True,
                 isCSV=False, runForever=False, maxsize=0, filename=None,
                 **kwargs):
        """Set up the reading function and queue for the DataStreamer.

        DataStreamer's constructor is typically invoked by calling
        getDataStreamer(config_options).  Error checking for appropriate
        configuration settings, and for sufficient contents in **kwargs
        is presumed to be done during XSD validation.  The reason this
        initialization function is separate is to allow an advanced user
        to call the streamer from a script and bypass having to make an
        XML object containing configuration settings.

        Arguments:

            fromHTTP (boolean; default False):
                If True, the reader will be an HTTPInterfaceServer built
                from kwargs['port'] and kwargs['url'].

            interactive (boolean; default False):
                If True, the reader will be None and the user will push
                data to the queue to score using
                self.enqueue(self, dictionary) in which dictionary is a
                dictionary or a UniRecord; a row in a UniTable.
                Takes precedence over fromHTTP.

            isXML (boolean; default True):
                If True, the reader will process the input stream as XML;
                otherwise whole UniTables are enqueued.

            isCSV (boolean; default False):
                Stored as the 'isCSV' attribute of the reader.

            runForever (boolean; default False):
                If True, run forever.  Otherwise read all data and then
                exit.

            maxsize (integer; default 0):
                The maximum number of objects allowed in self.queue.
                If zero, the Queue can be arbitrarily long.

            filename (string; default None):
                Used only in file mode: either '-' (passed to the Reader
                unchanged) or a glob pattern; matching files are read in
                sorted order.

            **kwargs (arguments for the Reader)

        Raises:

            RuntimeError: in file mode, if no file matches 'filename'.
        """
        self._runOptions = NameSpace(
            fromHTTP=fromHTTP,
            interactive=interactive,
            isXML=isXML,
            runForever=runForever)
        self._fileList = filename  # None or else will become a list...
        self.currentFileNumber = 0
        self._logger = logging.getLogger()
        self._metadata = logging.getLogger('metadata')
        self._thread = None
        self._values = None
        self._queue = Queue.Queue(maxsize)

        callback = self._xmlCallback if isXML else self._unitableCallback

        if interactive:
            self._reader = None
        elif fromHTTP:
            def http_callback(data):
                # Parse one HTTP payload with a throw-away Reader that
                # feeds the same callback as the file-based path.
                wrapper = StringIO.StringIO(data)
                rdr = Reader(callback,
                             source=wrapper,
                             logger=self._logger,
                             magicheader=False,
                             unitable=not isXML,
                             wholeUniTable=not isXML)
                pipe = rdr.new_pipe()
                try:
                    result = rdr.feed_pipe(None, pipe)
                except Exception:
                    # Narrowed from a bare except so KeyboardInterrupt and
                    # SystemExit are not turned into IOError.
                    raise IOError("Problem reading data over HTTP.")
                return result

            self._reader = HTTPInterfaceServer(
                ('', kwargs['port']), logger=logging.getLogger(''))
            self._reader.register_callback(kwargs['url'], http_callback)
            self._reader.isCSV = isCSV
        else:
            if filename == '-':
                self._fileList = ['-']
            else:
                import glob
                # Reverse-sorted so that pop() hands the files out in
                # ascending order.
                self._fileList = glob.glob(filename)
                self._fileList.sort()
                self._fileList.reverse()
                if len(self._fileList) == 0:
                    raise RuntimeError(
                        "No Data Input files matched %s" % filename)
            self._reader = Reader(callback,
                                  unitable=not isXML,
                                  wholeUniTable=not isXML,
                                  **kwargs)
            self._reader.source = self._fileList.pop()
            self._reader.isCSV = isCSV

    def enqueue(self, dictionary):
        """Add the dictionary (or UniTable) to the queue.

        If the queue stays full for 0.1 seconds the item is dropped and
        an error is logged.

        Arguments:

            dictionary format is {field1:value1, field2:value2}
        """
        try:
            self._queue.put(dictionary, timeout=0.1)
        except Queue.Full:
            self._logger.error("Data stream queue dropped:%s" % dictionary)

    def get(self, field):
        """Return the element's value or MISSING.

        Arguments:

            field (string):
                The name of a field in the dictionary/UniRecord

        Returns:

            MISSING if there is no current record or it lacks the field
            (* MISSING means the value is absent), INVALID if the value
            is a float NaN or infinity, otherwise the value itself.
        """
        if self._values is None:
            return MISSING
        if field not in self._values.keys():
            self._logger.debug("Data not found for field: %s" % field)
            return MISSING
        output = self._values[field]
        if isinstance(output, float) and \
                (numpy.isnan(output) or numpy.isinf(output)):
            return INVALID
        return output

    def __iter__(self):
        """Return self; the streamer is its own iterator."""
        return self

    def next(self):
        """Advance to the next record and return it.

        In XML mode each queue item is one record.  Otherwise each queue
        item is a UniTable: the current record is advanced with its own
        next() until it is exhausted, then the next UniTable is fetched
        and its first row becomes the current record.

        Raises:

            StopIteration: when the reader thread is not running, the
                queue is empty and there are no more files to read.
        """
        logDebug = self._logger.getEffectiveLevel() <= logging.DEBUG

        if self._runOptions.isXML:
            self._values = None  # reset in order to get the next item
        elif self._values is not None:
            try:
                # Iterate over the UniTable
                self._values = self._values.next()
            except StopIteration:
                # keep going; try to get the next UniTable
                self._values = None
            else:
                if logDebug:
                    self._logger.debug("This record: %s" % self._values)
                return self._values

        try:
            self._values = self._queue.get(timeout=0.1)
            self._queue.task_done()
        except Queue.Empty:
            while self._values is None:
                # Sample the reader state BEFORE touching the queue: if
                # the reader is already finished and the queue is empty
                # afterwards, nothing more can arrive from it.
                readerRunning = (self._thread is not None
                                 and self._thread.is_alive())
                try:
                    if readerRunning:
                        # Poll with a timeout rather than blocking
                        # forever, so a reader that ends without
                        # enqueueing anything cannot hang the iteration.
                        self._values = self._queue.get(timeout=0.1)
                    else:
                        # Pick up anything the reader enqueued just
                        # before it finished.
                        self._values = self._queue.get_nowait()
                except Queue.Empty:
                    if readerRunning:
                        continue
                else:
                    self._queue.task_done()
                    sleep(0)
                    continue

                # The reader is done and the queue is drained: reset my
                # thread and step to the next file, if it exists.
                self._thread = None
                if not isinstance(self._fileList, list) \
                        or not self._fileList:
                    # Also covers interactive/HTTP mode, where _fileList
                    # never became a list.
                    raise StopIteration
                self._reader.source = self._fileList.pop()
                self.currentFileNumber += 1
                self.initialize()

        if not self._runOptions.isXML and self._values is not None:
            # Step into the UniTable
            self._values = self._values[0]
            if logDebug:
                self._logger.debug("This record: %s" % self._values)
        return self._values

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def _read(self):
        """The thread callback for enqueueing the data.

        When reading forever from a file handle (per the XSD, can only
        read forever FromHTTP or FromFifo) loop continuously around
        single reads.
        """
        if self._runOptions.interactive:
            self._logger.error(
                "DataStreamer._read is not for interactive mode.  " +
                "Instead use DataStreamer.enqueue.")
            return

        if self._runOptions.fromHTTP:
            self._reader.serve_forever()
            return

        if self._runOptions.runForever:
            while True:
                try:
                    self._reader.read_forever()
                except KeyboardInterrupt:
                    self._logger.error("Keyboard Interrupt.")
                    raise
                except Exception:
                    # Log and retry; SystemExit is no longer swallowed.
                    self._logger.error(
                        "error reading data: %s" % sys.exc_info()[0])
                sleep(0)
        else:
            self._metadata.startTiming('Time Reading Data')
            try:
                self._reader.read_once()
            finally:
                # Stop the timer even when the read fails.
                self._metadata.stopTiming('Time Reading Data')

    def initialize(self):
        """Start receiving/reading information and posting to my queue.

        Launch a thread with target method to start reading from the
        data source.  The method depends on the run options set on
        initialization.
        """
        if self._runOptions.interactive:
            self._logger.error(
                "DataStreamer.initialize is not for interactive mode.  " +
                "Instead use DataStreamer.enqueue.")
            return

        if self._thread:
            # Current implementation is to only read the source once.
            self._logger.error(
                "DataStream streaming invoked again on the same source.")
            return

        self._thread = threading.Thread(target=self._read)
        # Don't quit Python until after the thread is finished running.
        self._thread.daemon = False
        self._thread.start()

    def _unitableCallback(self, uni_table):
        """Reader callback: enqueue a whole UniTable."""
        self.enqueue(uni_table)

    def _xmlCallback(self, native_element):
        """Reader callback: enqueue one dictionary per child row, built
        from the row's attributes with the keys converted to str."""
        for row in native_element:
            obj = dict((str(k), row.attr[k]) for k in row.attr)
            self.enqueue(obj)
def main(config, outfile=None, port=None):
    """Main function for controlling scoring.

    Reads the configuration file, builds the data input from the first
    recognized input element in ``consumer.data_input_info``, initializes
    the model and then scores in batch, run-forever or read-once mode.

    Arguments:

        config: Name of the configuration file (converted with str()).

        outfile (default None): When given, overrides the output file
            name obtained from the configuration file.

        port (default None): When given, overrides the 'port' attribute
            of a fromHTTP input element.

    Returns:

        The pmmlConsumer when the configuration contains an 'eventBased'
        element (the calling script then drives scoring itself);
        otherwise None.

    Raises:

        ConfigurationError: if the configuration holds no data input
            information, or none of its elements yields a data input.
    """
    def attr_or_none(element, key):
        """Return element.attr[key], or None when the attribute is absent."""
        if key in element.attr:
            return element.attr[key]
        return None

    # Read in a config file with a bunch of options describing where
    # everything is.
    consumer = pmmlConsumer()
    # Nothing is logged before the config file is read: the logging
    # handler is set up while reading it, so earlier messages would only
    # produce 'No handlers could be found for logger "consumer"'.
    config_reader = Reader(consumer.configure, source=str(config),
                           magicheader=False, autoattr=False)
    config_reader.read_once()

    # Overwrite the out file from the config file with the command line
    # option if it was present.
    if outfile:
        consumer.output_filename = outfile

    # Create any reader or http server to read in data.
    data_input = None
    run_forever = True
    run_daemon = False
    script_input = False

    # Check to make sure that we don't try to iterate over None.
    if consumer.data_input_info is None:
        raise ConfigurationError(
            "Data input source missing from configuration.")

    for item in consumer.data_input_info:
        if item.name == "readOnce":
            run_forever = False
        elif item.name == "batchScoring":
            consumer.batch_scoring = True
        elif item.name == "daemon":
            run_daemon = True
        elif data_input is not None:
            # Only process the first way that we are told to get the data.
            continue
        elif item.name == "fromFile" or item.name == "fromFifo":
            # No special treatment needed other than UniTable vs XML.
            isUni = attr_or_none(item, 'type') == "UniTable"
            data_input = Reader(consumer.score, source=item.attr['name'],
                                logger=consumer.logger, magicheader=False,
                                unitable=isUni, framing='EOF')
        elif item.name == "fromFixedRecordFile":
            # Fields are laid out back to back: each one starts where the
            # previous one ended.
            ffnames = []
            ffstarts = []
            ffends = []
            start = 0
            for field in item:
                ffnames.append(field.attr['name'])
                ffstarts.append(start)
                start += int(field.attr['length'])
                ffends.append(start)
            ffCR = attr_or_none(item, 'cr')
            data_input = Reader(consumer.score, source=item.attr['name'],
                                types=None, logger=consumer.logger,
                                magicheader=False, unitable=True,
                                ffConvert=ffConfig(ffnames, ffstarts,
                                                   ffends, ffCR))
        elif item.name == "fromCSVFile":
            # We have a CSV file that needs special treatment to read in
            # correctly.
            data_input = Reader(consumer.score, source=item.attr['name'],
                                logger=consumer.logger, magicheader=False,
                                unitable=True,
                                header=attr_or_none(item, 'header'),
                                sep=attr_or_none(item, 'sep'),
                                types=attr_or_none(item, 'types'),
                                framing='EOF')
        elif item.name == "fromStandardInput":
            isUni = attr_or_none(item, 'type') == "UniTable"
            framing = 'EOF'
            if 'framing' in item.attr:
                framing = item.attr['framing']
            data_input = Reader(consumer.score, source="-",
                                logger=consumer.logger, magicheader=False,
                                unitable=isUni,
                                sep=attr_or_none(item, 'sep'),
                                types=attr_or_none(item, 'types'),
                                framing=framing)
        elif item.name == "fromHTTP":
            # Get the stuff we need to set up the server; the 'port'
            # argument wins over the configured port.
            input_url = item.attr['url']
            if port:
                input_port = int(port)
            else:
                input_port = int(item.attr['port'])
            if attr_or_none(item, 'type') == "UniTable":
                callback = consumer.score_http_uni
            else:
                callback = consumer.score_http_xml
            # Create the server and add the callback.
            data_input = HTTPInterfaceServer(('', input_port),
                                             logger=consumer.logger)
            data_input.register_callback(input_url, callback)
        elif item.name == "eventBased":
            script_input = True
            # Dummy value to get past the check for None below.
            data_input = False
        else:
            # Not recognized.
            consumer.logger.debug(
                "Element %s is not a recognized child element of inputData, ignoring." % (item.name))

    if data_input is None:
        # We made it through the config information without finding a
        # data input source.
        raise ConfigurationError("Unable to determine data input source.")

    consumer.logger.debug("Initialize model")
    # Initialize the model.  This is after the data information is input
    # so that batch scoring may be faster.
    consumer.initialize_model()

    if script_input:
        # Another script has called main, return the consumer so it can
        # handle how score is called.
        return consumer

    consumer.logger.warning("Ready to score")

    # Start scoring data.
    if consumer.metadata:
        # By default, for now, enable collection of metadata by data
        # reader and model (consumer general metadata is enabled earlier).
        data_input.enableMetaDataCollection()
        consumer.model.enableMetaDataCollection()

    if consumer.batch_scoring:
        if consumer.metadata:
            consumer.metadata.log.info('Batch Scoring -One Score Per Segment\n')
        consumer.logger.debug("Batch Scoring")
        if isinstance(data_input, Reader):
            data_input.read_once()
            report = consumer.format_results(consumer.model.batchScore())
            if consumer.output_filename:
                # Close the output even if writing the report fails.
                try:
                    consumer.output_report_header(file_handle=consumer.out)
                    consumer.out.write(report)
                    consumer.output_report_footer(file_handle=consumer.out)
                finally:
                    consumer.out.close()
    elif run_forever:
        if consumer.metadata:
            consumer.metadata.log.info('Run Forever - One Score Per Event')
        consumer.logger.debug("Run Forever")
        if isinstance(data_input, Reader):
            consumer.output_report_header()
            data_input.read_forever()
            # NOTE(review): only this footer call is handed consumer.out;
            # the header above and the run-once footer are called without
            # it -- confirm this is intended.
            consumer.output_report_footer(consumer.out)
        elif isinstance(data_input, HTTPServer):
            data_input.serve_forever()
        else:
            consumer.logger.critical("Reading data failed.")
    else:
        # Just read once -- or, as a daemon, once per restart signal.
        while True:
            if consumer.metadata is not None:
                consumer.metadata.log.info('Run Once - One Score Per Event')
                consumer.metadata.log.info('Start at %s'%datetime.datetime.now().isoformat())
            consumer.logger.debug("Run Once")
            if isinstance(data_input, Reader):
                consumer.output_report_header()
                data_input.read_once()
                consumer.output_report_footer()
            elif isinstance(data_input, HTTPServer):
                data_input.handle_request()
            else:
                consumer.logger.critical("Reading data failed.")
            if consumer.metadata:
                consumer.metadata.log.info('End at %s'%datetime.datetime.now().isoformat())
            if not run_daemon:
                break
            # Daemon mode: sleep until SIGALRM or SIGUSR1 arrives, then
            # run another pass.
            signal.signal(signal.SIGALRM, daemonRestartHandler)
            signal.signal(signal.SIGUSR1, daemonRestartHandler)
            signal.pause()  # unix only

    if consumer.metadata:
        consumer.metadata['Stacksize after Scoring'] = ptools.stacksize()
        consumer.metadata['Resident Memory after Scoring'] = ptools.resident()/1e+9 #Gb
        consumer.metadata['Memory after Scoring'] = ptools.memory()/1e+9 #Gb
        consumer.metadata.collected['DataInput'] = data_input.getMetaData()
        consumer.metadata.collected['Scoring'] = consumer.getMetaData()
        consumer.metadata.collected[''] = consumer.model.getMetaData()
        consumer.metadata.report()
def main(config=None):
    """Main function for controlling scoring.

    Reads the configuration file, builds the data input (a Reader or an
    HTTPInterfaceServer) from the first recognized input element in
    ``consumer.data_input_info``, initializes the model and then scores in
    batch, run-forever or read-once mode.

    Arguments:

        config (string or None; default None): Name of the configuration
            file.  When it is empty or None the name is taken from the
            -c/--config command line option (default "config.xml").

    Raises:

        ConfigurationError: if the configuration holds no data input
            information, or none of its elements yields a data input.
    """
    logging.basicConfig(level=logging.DEBUG)
    from optparse import OptionParser, make_option

    def attr_or_none(element, key):
        """Return element.attr[key], or None when the attribute is absent."""
        if key in element.attr:
            return element.attr[key]
        return None

    # Define the options.
    usage = "usage: %prog [options]"
    version = "%prog 0.3.3"
    options = [
        make_option("-c",
                    "--config",
                    metavar="config",
                    default="config.xml",
                    help="The configuration file name")
    ]
    parser = OptionParser(usage=usage, version=version, option_list=options)

    # Parse the command line only when the caller did not name a file.
    if not config:
        parsed_options, _ = parser.parse_args()
        config = parsed_options.config

    # Take in a bunch of options describing where everything is.
    consumer = pmmlConsumer()
    consumer.logger.debug("Create Reader to get Configuration")
    config_reader = Reader(consumer.configure,
                           source=str(config),
                           magicheader=False,
                           autoattr=False)
    consumer.logger.debug("Read Config File")
    config_reader.read_once()

    # Create any reader or http server to read in data.
    data_input = None
    run_forever = True

    # Check to make sure that we don't try to iterate over None.
    if consumer.data_input_info is None:
        raise ConfigurationError(
            "Data input source missing from configuration.")

    for item in consumer.data_input_info:
        if item.name == "readOnce":
            run_forever = False
        elif item.name == "batchScoring":
            consumer.batch_scoring = True
        elif data_input is not None:
            # Only process the first way that we are told to get the data.
            continue
        elif item.name == "fromFile" or item.name == "fromFifo":
            # No special treatment needed other than UniTable vs XML.
            isUni = attr_or_none(item, 'type') == "UniTable"
            data_input = Reader(consumer.score,
                                source=item.attr['name'],
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=isUni)
        elif item.name == "fromFixedRecordFile":
            # Fields are laid out back to back: each one starts where the
            # previous one ended.
            ffnames = []
            ffstarts = []
            ffends = []
            start = 0
            for field in item:
                ffnames.append(field.attr['name'])
                ffstarts.append(start)
                start += int(field.attr['length'])
                ffends.append(start)
            ffCR = attr_or_none(item, 'cr')
            data_input = Reader(consumer.score,
                                source=item.attr['name'],
                                types=None,
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=True,
                                ffConvert=ffConfig(ffnames, ffstarts,
                                                   ffends, ffCR))
        elif item.name == "fromCSVFile":
            # We have a CSV file that needs special treatment to read in
            # correctly.
            data_input = Reader(consumer.score,
                                source=item.attr['name'],
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=True,
                                header=attr_or_none(item, 'header'),
                                sep=attr_or_none(item, 'sep'),
                                types=attr_or_none(item, 'types'))
        elif item.name == "fromStandardInput":
            isUni = attr_or_none(item, 'type') == "UniTable"
            data_input = Reader(consumer.score,
                                source="-",
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=isUni)
        elif item.name == "fromHTTP":
            # Get the stuff we need to set up the server.
            input_url = item.attr['url']
            input_port = int(item.attr['port'])
            if attr_or_none(item, 'type') == "UniTable":
                callback = consumer.score_http_uni
            else:
                callback = consumer.score_http_xml
            # Create the server and add the callback.
            data_input = HTTPInterfaceServer(('', input_port),
                                             logger=consumer.logger)
            data_input.register_callback(input_url, callback)
        else:
            # Not recognized.
            consumer.logger.warning(
                "Element %s is not a recognized child element of inputData, ignoring."
                % (item.name))

    if data_input is None:
        raise ConfigurationError("Unable to determine data input source.")

    consumer.logger.debug("Initialize model")
    # Initialize the model.  This is after the data information is input so
    # that batch scoring may be faster.
    # NOTE(review): 'initalize_model' is the spelling this code calls;
    # confirm it matches the method defined on pmmlConsumer.
    consumer.initalize_model()
    consumer.logger.warning("Ready to score")

    # Start scoring data.
    if consumer.batch_scoring:
        consumer.logger.debug("Batch Scoring")
        if isinstance(data_input, Reader):
            data_input.read_once()
            report = consumer.format_results(consumer.model.batchScore())
            if consumer.output_filename:
                out = open(consumer.output_filename, 'w')
                # Close the report file even if writing it fails.
                try:
                    consumer.output_report_header(file_handle=out)
                    out.write(report)
                    consumer.output_report_footer(file_handle=out)
                finally:
                    out.close()
    elif run_forever:
        consumer.logger.debug("Run Forever")
        if isinstance(data_input, Reader):
            consumer.output_report_header()
            data_input.read_forever()
            consumer.output_report_footer()
        elif isinstance(data_input, HTTPServer):
            data_input.serve_forever()
        else:
            print("Reading data failed.")
    else:
        # Just read once.
        consumer.logger.debug("Run Once")
        if isinstance(data_input, Reader):
            consumer.output_report_header()
            data_input.read_once()
            consumer.output_report_footer()
        elif isinstance(data_input, HTTPServer):
            data_input.handle_request()
        else:
            print("Reading data failed.")
class DataStreamer:
    """Contains a queue of data.  Its internal read function uses threading.

    Usage:

        dataStreamer = DataStreamer(config_options)
        # Get data_fields from the PMML model (assigned in this example).
        data_fields = {'field1':int, 'field2':str}

        # If reading from a source handle:
        dataStreamer.initialize()
        for record in dataStreamer:
            # process the record

        # Else if in interactive mode, add data using 'enqueue'.
        # (In a thread if the data must be buffered...)
        dataStreamer.enqueue(data)

    Public methods:

        get, next, and __iter__
        enqueue(self, dictionary)
        initialize(self)  # Can only call this once.

    Internal methods:

        _read(self)
        _unitableCallback(self, uni_record)
        _xmlCallback(self, native_element)

    Data Members:

        _runOptions(NameSpace; contains initialization arguments)
        _logger (Logger)
        _queue (Queue): Queue of unprocessed, but read, data elements.
            These should be either dictionaries or UniTables.
        _reader (Either None, a Reader, or HTTPInterfaceServer)
        _thread (Thread): Thread that will run the reader.
        _values (dictionary): The most recent row of data from _queue.
    """

    def __init__(
            self, fromHTTP=False, interactive=False, isXML=True,
            isCSV=False, runForever=False, maxsize=0, filename=None,
            **kwargs):
        """Set up the reading function and queue for the DataStreamer.

        DataStreamer's constructor is typically invoked by calling
        getDataStreamer(config_options).  Error checking for appropriate
        configuration settings, and for sufficient contents in **kwargs
        is presumed to be done during XSD validation.  The reason this
        initialization function is separate is to allow an advanced user
        to call the streamer from a script and bypass having to make an
        XML object containing configuration settings.

        Arguments:

            fromHTTP (boolean; default False):
                If True, the reader will be an HTTPInterfaceServer built
                from kwargs['port'] and kwargs['url'].

            interactive (boolean; default False):
                If True, the reader will be None and the user will push
                data to the queue to score using
                self.enqueue(self, dictionary) in which dictionary is a
                dictionary or a UniRecord; a row in a UniTable.
                Takes precedence over fromHTTP.

            isXML (boolean; default True):
                If True, the reader will process the input stream as XML;
                otherwise whole UniTables are enqueued.

            isCSV (boolean; default False):
                Stored as the 'isCSV' attribute of the reader.

            runForever (boolean; default False):
                If True, run forever.  Otherwise read all data and then
                exit.

            maxsize (integer; default 0):
                The maximum number of objects allowed in self.queue.
                If zero, the Queue can be arbitrarily long.

            filename (string; default None):
                Used only in file mode: either '-' (passed to the Reader
                unchanged) or a glob pattern; matching files are read in
                sorted order.

            **kwargs (arguments for the Reader)

        Raises:

            RuntimeError: in file mode, if no file matches 'filename'.
        """
        self._runOptions = NameSpace(
            fromHTTP=fromHTTP,
            interactive=interactive,
            isXML=isXML,
            runForever=runForever)
        self._fileList = filename  # None or else will become a list...
        self.currentFileNumber = 0
        self._logger = logging.getLogger()
        self._metadata = logging.getLogger('metadata')
        self._thread = None
        self._values = None
        self._queue = Queue.Queue(maxsize)

        callback = self._xmlCallback if isXML else self._unitableCallback

        if interactive:
            self._reader = None
        elif fromHTTP:
            def http_callback(data):
                # Parse one HTTP payload with a throw-away Reader that
                # feeds the same callback as the file-based path.
                wrapper = StringIO.StringIO(data)
                rdr = Reader(callback,
                             source=wrapper,
                             logger=self._logger,
                             magicheader=False,
                             unitable=not isXML,
                             wholeUniTable=not isXML)
                pipe = rdr.new_pipe()
                try:
                    result = rdr.feed_pipe(None, pipe)
                except Exception:
                    # Narrowed from a bare except so KeyboardInterrupt and
                    # SystemExit are not turned into IOError.
                    raise IOError("Problem reading data over HTTP.")
                return result

            self._reader = HTTPInterfaceServer(
                ('', kwargs['port']), logger=logging.getLogger(''))
            self._reader.register_callback(kwargs['url'], http_callback)
            self._reader.isCSV = isCSV
        else:
            if filename == '-':
                self._fileList = ['-']
            else:
                import glob
                # Reverse-sorted so that pop() hands the files out in
                # ascending order.
                self._fileList = glob.glob(filename)
                self._fileList.sort()
                self._fileList.reverse()
                if len(self._fileList) == 0:
                    raise RuntimeError(
                        "No Data Input files matched %s" % filename)
            self._reader = Reader(callback,
                                  unitable=not isXML,
                                  wholeUniTable=not isXML,
                                  **kwargs)
            self._reader.source = self._fileList.pop()
            self._reader.isCSV = isCSV

    def enqueue(self, dictionary):
        """Add the dictionary (or UniTable) to the queue.

        If the queue stays full for 0.1 seconds the item is dropped and
        an error is logged.

        Arguments:

            dictionary format is {field1:value1, field2:value2}
        """
        try:
            self._queue.put(dictionary, timeout=0.1)
        except Queue.Full:
            self._logger.error("Data stream queue dropped:%s" % dictionary)

    def get(self, field):
        """Return the element's value or MISSING.

        Arguments:

            field (string):
                The name of a field in the dictionary/UniRecord

        Returns:

            MISSING if there is no current record or it lacks the field
            (* MISSING means the value is absent), INVALID if the value
            is a float NaN or infinity, otherwise the value itself.
        """
        if self._values is None:
            return MISSING
        if field not in self._values.keys():
            self._logger.debug("Data not found for field: %s" % field)
            return MISSING
        output = self._values[field]
        if isinstance(output, float) and \
                (numpy.isnan(output) or numpy.isinf(output)):
            return INVALID
        return output

    def __iter__(self):
        """Return self; the streamer is its own iterator."""
        return self

    def next(self):
        """Advance to the next record and return it.

        In XML mode each queue item is one record.  Otherwise each queue
        item is a UniTable: the current record is advanced with its own
        next() until it is exhausted, then the next UniTable is fetched
        and its first row becomes the current record.

        Raises:

            StopIteration: when the reader thread is not running, the
                queue is empty and there are no more files to read.
        """
        logDebug = self._logger.getEffectiveLevel() <= logging.DEBUG

        if self._runOptions.isXML:
            self._values = None  # reset in order to get the next item
        elif self._values is not None:
            try:
                # Iterate over the UniTable
                self._values = self._values.next()
            except StopIteration:
                # keep going; try to get the next UniTable
                self._values = None
            else:
                if logDebug:
                    self._logger.debug("This record: %s" %self._values)
                return self._values

        try:
            self._values = self._queue.get(timeout=0.1)
            self._queue.task_done()
        except Queue.Empty:
            while self._values is None:
                # Sample the reader state BEFORE touching the queue: if
                # the reader is already finished and the queue is empty
                # afterwards, nothing more can arrive from it.
                readerRunning = (self._thread is not None
                                 and self._thread.is_alive())
                try:
                    if readerRunning:
                        # Poll with a timeout rather than blocking
                        # forever, so a reader that ends without
                        # enqueueing anything cannot hang the iteration.
                        self._values = self._queue.get(timeout=0.1)
                    else:
                        # Pick up anything the reader enqueued just
                        # before it finished.
                        self._values = self._queue.get_nowait()
                except Queue.Empty:
                    if readerRunning:
                        continue
                else:
                    self._queue.task_done()
                    sleep(0)
                    continue

                # The reader is done and the queue is drained: reset my
                # thread and step to the next file, if it exists.
                self._thread = None
                if not isinstance(self._fileList, list) \
                        or not self._fileList:
                    # Also covers interactive/HTTP mode, where _fileList
                    # never became a list.
                    raise StopIteration
                self._reader.source = self._fileList.pop()
                self.currentFileNumber += 1
                self.initialize()

        if not self._runOptions.isXML and self._values is not None:
            # Step into the UniTable
            self._values = self._values[0]
            if logDebug:
                self._logger.debug("This record: %s" %self._values)
        return self._values

    # Python 3 spelling of the iterator protocol.
    __next__ = next

    def _read(self):
        """The thread callback for enqueueing the data.

        When reading forever from a file handle (per the XSD, can only
        read forever FromHTTP or FromFifo) loop continuously around
        single reads.
        """
        if self._runOptions.interactive:
            self._logger.error(
                "DataStreamer._read is not for interactive mode.  " +
                "Instead use DataStreamer.enqueue.")
            return

        if self._runOptions.fromHTTP:
            self._reader.serve_forever()
            return

        if self._runOptions.runForever:
            while True:
                try:
                    self._reader.read_forever()
                except KeyboardInterrupt:
                    self._logger.error("Keyboard Interrupt.")
                    raise
                except Exception:
                    # Log and retry; SystemExit is no longer swallowed.
                    self._logger.error(
                        "error reading data: %s" % sys.exc_info()[0])
                sleep(0)
        else:
            self._metadata.startTiming('Time Reading Data')
            try:
                self._reader.read_once()
            finally:
                # Stop the timer even when the read fails.
                self._metadata.stopTiming('Time Reading Data')

    def initialize(self):
        """Start receiving/reading information and posting to my queue.

        Launch a thread with target method to start reading from the
        data source.  The method depends on the run options set on
        initialization.
        """
        if self._runOptions.interactive:
            self._logger.error(
                "DataStreamer.initialize is not for interactive mode.  " +
                "Instead use DataStreamer.enqueue.")
            return

        if self._thread:
            # Current implementation is to only read the source once.
            self._logger.error("DataStream streaming invoked again on the same source.")
            return

        self._thread = threading.Thread(target=self._read)
        # Don't quit Python until after the thread is finished running.
        self._thread.daemon = False
        self._thread.start()

    def _unitableCallback(self, uni_table):
        """Reader callback: enqueue a whole UniTable."""
        self.enqueue(uni_table)

    def _xmlCallback(self, native_element):
        """Reader callback: enqueue one dictionary per child row, built
        from the row's attributes with the keys converted to str."""
        for row in native_element:
            obj = dict((str(k), row.attr[k]) for k in row.attr)
            self.enqueue(obj)