예제 #1
0
def main(config=None):
  """Configure a PMML consumer from a file and score its data source.

  Arguments:

    config (string or None; default None):
    Name of the configuration file.  When omitted (or otherwise false),
    the name is taken from the -c/--config command-line option, which
    defaults to "config.xml".

  The children of the configured data input information select the
  behaviour: "readOnce" makes a single pass instead of running forever,
  "batchScoring" turns on batch scoring, and the first of "fromFile",
  "fromFifo", "fromFixedRecordFile", "fromCSVFile", "fromStandardInput"
  or "fromHTTP" determines where the data comes from.  Any further data
  source is ignored and unrecognized elements are logged as warnings.

  Raises:

    ConfigurationError: if the configuration holds no data input
    information, or none of its children names a usable data source.
  """
  logging.basicConfig(level=logging.DEBUG)
  
  from optparse import OptionParser, make_option
  #define the options
  usage = "usage: %prog [options]"
  version = "%prog 0.3.3"
  options = [
    make_option("-c","--config",metavar="config",default="config.xml",help="The configuration file name")]
  parser = OptionParser(usage=usage, version=version, option_list=options)
  
  #parse the options (only consulted when no config name was passed in)
  if not config:
    (options, arguments) = parser.parse_args()
    config = options.config
  
  def optional_attr(element, key):
    """Return element.attr[key], or None when the attribute is absent."""
    if key in element.attr:
      return element.attr[key]
    return None
  
  #Take in a bunch of options describing where everything is
  consumer = pmmlConsumer()
  consumer.logger.debug("Create Reader to get Configuration")
  config_reader = Reader(consumer.configure, source = str(config), magicheader = False, autoattr = False)
  consumer.logger.debug("Read Config File")
  config_reader.read_once()
  
  #Create any reader or http server to read in data
  data_input = None
  run_forever = True
  
  #Check to make sure that we don't try to iterate over None
  if consumer.data_input_info is None:
    raise ConfigurationError("Data input source missing from configuration.")
  
  for item in consumer.data_input_info:
    if item.name == "readOnce":
      run_forever = False
    elif item.name == "batchScoring":
      consumer.batch_scoring = True
    elif data_input is not None:
      continue #Only process the first way that we are told to get the data.
    elif item.name == "fromFile" or item.name == "fromFifo":
      #No special treatment needed other than UniTable vs XML
      isUni = optional_attr(item, 'type') == "UniTable"
      data_input = Reader(consumer.score, source = item.attr['name'], logger = consumer.logger, magicheader = False, unitable = isUni)
    elif item.name == "fromFixedRecordFile":
      #Each child describes one field; fields are laid out back to back,
      #so every field starts where the previous one ended.
      ffnames = []
      ffstarts = []
      ffends = []
      start = 0
      for field in item:
        ffnames.append(field.attr['name'])
        width = int(field.attr['length'])
        ffstarts.append(start)
        ffends.append(start + width)
        start += width
      ffCR = optional_attr(item, 'cr')
      data_input = Reader(consumer.score, source = item.attr['name'],
        types = None,
        logger = consumer.logger, magicheader = False, unitable = True, ffConvert = ffConfig(ffnames, ffstarts, ffends, ffCR))
    elif item.name == "fromCSVFile":
      #We have a CSV file that needs special treatment to read in correctly
      header = optional_attr(item, 'header')
      sep = optional_attr(item, 'sep')
      types = optional_attr(item, 'types')
      data_input = Reader(consumer.score, source = item.attr['name'], logger = consumer.logger, magicheader = False, unitable = True, header = header, sep = sep, types = types)
    elif item.name == "fromStandardInput":
      isUni = optional_attr(item, 'type') == "UniTable"
      data_input = Reader(consumer.score, source = "-", logger = consumer.logger, magicheader = False, unitable = isUni)
    elif item.name == "fromHTTP":
      #get the stuff we need to setup the server
      input_url = item.attr['url']
      input_port = int(item.attr['port'])
      if optional_attr(item, 'type') == "UniTable":
        callback = consumer.score_http_uni
      else:
        callback = consumer.score_http_xml
      
      #Create the server
      data_input = HTTPInterfaceServer(('',input_port), logger = consumer.logger)
      #Add the callback
      data_input.register_callback(input_url, callback)
    else:
      #Not recognized
      consumer.logger.warning("Element %s is not a recognized child element of inputData, ignoring." % (item.name))
    
  if data_input is None:
    raise ConfigurationError("Unable to determine data input source.")
  consumer.logger.debug("Initialize model")
  #Initialize the model
  #this is after the data information is input so that batch scoring may be faster
  consumer.initalize_model()  
  consumer.logger.warning("Ready to score")
  #Start scoring data
  if consumer.batch_scoring:
    consumer.logger.debug("Batch Scoring")
    #Batch scoring only applies to Reader inputs; other inputs are left alone.
    if isinstance(data_input, Reader):
      data_input.read_once()
      report = consumer.format_results(consumer.model.batchScore())
      if consumer.output_filename:
        out = open(consumer.output_filename, 'w')
        try:
          consumer.output_report_header(file_handle = out)
          out.write(report)
          consumer.output_report_footer(file_handle = out)
        finally:
          #Release the handle even when writing the report fails.
          out.close()
  elif run_forever:
    consumer.logger.debug("Run Forever")
    if isinstance(data_input, Reader):
      consumer.output_report_header()
      data_input.read_forever()
      consumer.output_report_footer()
    elif isinstance(data_input, HTTPServer):
      #presumably HTTPInterfaceServer derives from HTTPServer; verify against its definition
      data_input.serve_forever()
    else:
      #Call form prints the same text under Python 2 and also parses under Python 3.
      print("Reading data failed.")
  else: #just read once
    consumer.logger.debug("Run Once")
    if isinstance(data_input, Reader):
      consumer.output_report_header()
      data_input.read_once()
      consumer.output_report_footer()
    elif isinstance(data_input, HTTPServer):
      data_input.handle_request()
    else:
      print("Reading data failed.")
예제 #2
0
class DataStreamer:
    """Contains a queue of data.  Its internal read function uses threading.

    Usage:

        dataStreamer = DataStreamer(config_options)
        # Get data_fields from the PMML model (assigned in this example).
        data_fields = {'field1':int, 'field2':str}

        # If reading from a source handle:
        dataStreamer.initialize()
        for record in dataStreamer:
            # process the record

        # Else if in interactive mode, add data using 'enqueue'.
        # (In a thread if the data must be buffered...)
        dataStreamer.enqueue(data) 

    Public methods:

        get, next, and __iter__

        enqueue(self, dictionary)
        initialize(self)  # Can only call this once.

    Internal methods:

        _read(self)
        _unitableCallback(self, uni_record)
        _xmlCallback(self, native_element)

    Data Members:

        _runOptions(NameSpace; contains initialization arguments)
        _logger (Logger)
        _metadata (Logger named 'metadata'; used to time single reads)
        _queue (Queue): Queue of unprocessed, but read, data elements.
                        These should be either dictionaries or UniTables.
        _reader (Either None, a Reader, or HTTPInterfaceServer)
        _thread (Thread): Thread that will run the reader.
        _values (dictionary): The most recent row of data from _queue.
        _fileList (list): Input files not yet handed to the reader.
        currentFileNumber (integer): Number of times the streamer has
                        moved on to a further input file.
    """
    def __init__(self,
                 fromHTTP=False,
                 interactive=False,
                 isXML=True,
                 isCSV=False,
                 runForever=False,
                 maxsize=0,
                 filename=None,
                 **kwargs):
        """Set up the reading function and queue for the DataStreamer.

        DataStreamer's constructor is typically invoked by
        calling getDataStreamer(config_options), defined below.
        Error checking for appropriate configuration settings,
        and for sufficient contents in **kwargs is presumed to be
        done during XSD validation.  The reason this initialization
        function is separate is to allow an advanced user to call
        the streamer from a script and bypass having to make an
        XML object containing configuration settings.

        Arguments:

            fromHTTP (boolean; default False):
            If True, the reader will be an HTTPInterfaceServer.

            interactive (boolean; default False):
            If True, the reader will be None and the user will push
            data to the queue to score using self.enqueue(self, dictionary)
            in which dictionary is a dictionary or a UniRecord; a row in a
            UniTable.

            isXML (boolean; default True):
            If True, the reader will process the input stream as XML;
            otherwise the input is read as whole UniTables.

            isCSV (boolean; default False):
            Stored as the 'isCSV' attribute of the reader (not used in
            interactive mode).

            runForever (boolean; default False):
            If True, run forever. Otherwise read all data and then exit.

            maxsize (integer; default 0):
            The maximum number of objects allowed in self.queue.
            If zero, the Queue can be arbitrarily long.

            filename (string; default None):
            Only used when reading from files: either '-' or a glob
            pattern.  The files matching the pattern are read one after
            another in sorted order.

            **kwargs (arguments for the Reader)
            With fromHTTP these must contain 'port' and 'url' instead.

        Raises:

            RuntimeError: when reading from files and no file matches
            'filename'.
        """
        self._runOptions =\
            NameSpace(
                fromHTTP=fromHTTP,
                interactive=interactive,
                isXML=isXML,
                runForever=runForever)
        # Files that are still to be read.  It stays empty unless the
        # streamer reads from files, so that next() can treat "not reading
        # files" and "no files left" the same way.
        self._fileList = []
        self.currentFileNumber = 0
        self._logger = logging.getLogger()
        self._metadata = logging.getLogger('metadata')
        self._thread = None
        self._values = None
        self._queue = Queue.Queue(maxsize)
        callback = self._xmlCallback if isXML else self._unitableCallback

        if interactive:
            self._reader = None
        elif fromHTTP:

            def http_callback(data):
                # Each HTTP payload is parsed by a throw-away Reader that
                # feeds the same callback as a file-based reader would.
                wrapper = StringIO.StringIO(data)
                rdr =\
                    Reader(callback,
                        source=wrapper,
                        logger=self._logger,
                        magicheader=False,
                        unitable=not isXML,
                        wholeUniTable=not isXML)
                pipe = rdr.new_pipe()
                try:
                    result = rdr.feed_pipe(None, pipe)
                except Exception:
                    # Keep the cause in the log; callers only see IOError.
                    # KeyboardInterrupt/SystemExit are left to propagate.
                    self._logger.error(
                        "Problem reading data over HTTP: %s" %
                        (sys.exc_info()[1],))
                    raise IOError("Problem reading data over HTTP.")
                return result

            self._reader =\
                HTTPInterfaceServer(
                    ('', kwargs['port']), logger=logging.getLogger(''))
            self._reader.register_callback(kwargs['url'], http_callback)
            self._reader.isCSV = isCSV

        else:
            if filename == '-':
                self._fileList = ['-']
            else:
                import glob
                # Reverse-sorted so that pop() hands the files out in
                # ascending order.
                self._fileList = sorted(glob.glob(filename), reverse=True)
            if not self._fileList:
                raise RuntimeError(
                    "No Data Input files matched %s" % filename)

            self._reader = Reader(callback,
                                  unitable=not isXML,
                                  wholeUniTable=not isXML,
                                  **kwargs)
            self._reader.source = self._fileList.pop()
            self._reader.isCSV = isCSV

    def enqueue(self, dictionary):
        """Add the dictionary (or UniTable) to the queue.

        If the queue stays full for 0.1 seconds the item is dropped and
        an error is logged.

        Arguments:

            dictionary format is {field1:value1, field2:value2}
        """
        try:
            self._queue.put(dictionary, timeout=0.1)
        except Queue.Full:
            self._logger.error("Data stream queue dropped:%s" % dictionary)

    def get(self, field):
        """Return the element's value, MISSING or INVALID.
        
        Arguments:

            field (string): The name of a field in the dictionary/UniRecord

            * MISSING means the value is absent (no current record, or the
              record has no such field).
            * INVALID is returned for float values that are NaN or infinite.
        """
        if self._values is None:
            return MISSING

        # NOTE(review): .keys() is kept on purpose; _values may be a
        # UniRecord, whose support for the 'in' operator is not visible here.
        if field not in self._values.keys():
            self._logger.debug("Data not found for field: %s" % field)
            return MISSING

        output = self._values[field]

        if isinstance(output, float) and \
            (numpy.isnan(output) or numpy.isinf(output)):
            return INVALID

        return output

    def __iter__(self):
        """Return self; the streamer is its own iterator."""
        return self

    def next(self):
        """Advance to the next record and return it.

        In XML mode every queue entry is one record.  Otherwise queue
        entries are UniTables: the current table is stepped through row
        by row before another table is taken from the queue.

        When the queue is empty and the reader thread has finished, the
        next file in the file list (if any) is started.

        Raises:

            StopIteration: when the queue is empty, no reader thread is
            running and there are no files left to read.
        """
        logDebug = self._logger.getEffectiveLevel() <= logging.DEBUG

        if self._runOptions.isXML:
            self._values = None
            # reset in order to get the next item
        elif self._values is not None:
            try:
                # Iterate over the UniTable
                self._values = self._values.next()
                if logDebug:
                    self._logger.debug("This record: %s" % self._values)
                return self._values
            except StopIteration:
                self._values = None
                # keep going; try to get the next UniTable

        try:
            self._values = self._queue.get(timeout=0.1)
            self._queue.task_done()
        except Queue.Empty:
            while self._values is None:
                readerAlive = \
                    self._thread is not None and self._thread.is_alive()
                try:
                    if readerAlive:
                        # Poll instead of blocking without a timeout: the
                        # reader may finish without queueing anything
                        # else, which would leave a plain get() waiting
                        # forever.
                        self._values = self._queue.get(timeout=0.1)
                    else:
                        # The reader is done; pick up anything it queued
                        # just before it finished.
                        self._values = self._queue.get_nowait()
                except Queue.Empty:
                    if readerAlive:
                        continue
                else:
                    self._queue.task_done()
                    sleep(0)
                    continue

                # The reader is finished and the queue is drained:
                # reset my thread.
                self._thread = None
                # Step to the next file, if it exists.  (The file list is
                # empty or None when not reading from files.)
                if self._fileList:
                    self._reader.source = self._fileList.pop()
                    self.currentFileNumber += 1
                    self.initialize()
                else:
                    raise StopIteration

        if not self._runOptions.isXML and self._values is not None:
            # Step into the UniTable
            self._values = self._values[0]
            if logDebug:
                self._logger.debug("This record: %s" % self._values)

        return self._values

    # Python 3 spelling of the iterator protocol; 'next' stays the public name.
    __next__ = next

    def _read(self):
        """The thread callback for enqueueing the data.

        When reading forever from a file handle (per the XSD,
        can only read forever FromHTTP or FromFifo) loop continuously
        around single reads.
        """
        if self._runOptions.interactive:
            self._logger.error(
                "DataStreamer._read is not for interactive mode.  "+\
                "Instead use DataStreamer.enqueue.")
            return

        if self._runOptions.fromHTTP:
            self._reader.serve_forever()
            return

        if self._runOptions.runForever:
            while True:
                try:
                    self._reader.read_forever()
                except KeyboardInterrupt:
                    self._logger.error("Keyboard Interrupt.")
                    raise
                except Exception:
                    # Best effort: log the failure and resume reading.
                    # SystemExit is no longer swallowed here.
                    self._logger.error("error reading data: %s" %
                                       sys.exc_info()[0])
                sleep(0)
        else:
            self._metadata.startTiming('Time Reading Data')
            self._reader.read_once()
            self._metadata.stopTiming('Time Reading Data')

    def initialize(self):
        """Start receiving/reading information and posting to my queue.

        Launch a thread with target method to start reading from
        the data source.  The method depends on the run options set
        on initialization.  Nothing is started (and an error is logged)
        in interactive mode or when a reader thread already exists.
        """
        if self._runOptions.interactive:
            self._logger.error(
                "DataStreamer.initialize is not for interactive mode.  "+\
                "Instead use DataStreamer.enqueue.")
            return

        if self._thread:
            # Current implementation is to only read the source once.
            self._logger.error(
                "DataStream streaming invoked again on the same source.")
            return

        self._thread = threading.Thread(target=self._read)
        # Don't quit Python until after the thread is finished running.
        self._thread.daemon = False
        self._thread.start()

    def _unitableCallback(self, uni_table):
        """Reader callback for UniTable input: queue the table as is."""
        self.enqueue(uni_table)

    def _xmlCallback(self, native_element):
        """Reader callback for XML input: queue one dictionary per child.

        Each child's attributes become a dictionary whose keys are
        converted with str().
        """
        for row in native_element:
            obj = dict((str(k), row.attr[k]) for k in row.attr)
            self.enqueue(obj)
예제 #3
0
def main(config, outfile=None, port=None):
  """Configure a PMML consumer from a file and score its data source.

  Arguments:

    config:
    Name of the configuration file (converted with str()).

    outfile (default None):
    When given, replaces the output file name taken from the
    configuration.

    port (default None):
    When given, replaces the port configured for "fromHTTP" input.

  The children of the configured data input information select the
  behaviour: "readOnce" makes a single pass instead of running forever,
  "batchScoring" turns on batch scoring, "daemon" makes a read-once run
  wait for a signal and then read again, and the first of "fromFile",
  "fromFifo", "fromFixedRecordFile", "fromCSVFile", "fromStandardInput",
  "fromHTTP" or "eventBased" determines where the data comes from.

  Returns:

    The configured consumer when the input is "eventBased" (the calling
    script then decides how scoring is invoked); otherwise None, after
    scoring has finished.

  Raises:

    ConfigurationError: if the configuration holds no data input
    information, or none of its children names a usable data source.
  """
  def optional_attr(element, key, default=None):
    """Return element.attr[key], or default when the attribute is absent."""
    if key in element.attr:
      return element.attr[key]
    return default

  #Read in a config file with a bunch of options describing where everything is
  consumer = pmmlConsumer()
  #Nothing is logged before the configuration has been read: the logging
  # handler is set up while reading the config file, so earlier messages
  # would only cause 'No handlers could be found for logger "consumer"'
  # to be printed.
  config_reader = Reader(consumer.configure, source = str(config), magicheader = False, autoattr = False)
  config_reader.read_once()

  #Overwrite the out file from the config file with the command line option if it was present.
  if outfile:
    consumer.output_filename = outfile
  #Create any reader or http server to read in data
  data_input = None
  run_forever = True
  run_daemon = False
  script_input = False
  
  #Check to make sure that we don't try to iterate over None
  if consumer.data_input_info is None:
    raise ConfigurationError("Data input source missing from configuration.")
  
  for item in consumer.data_input_info:
    if item.name == "readOnce":
      run_forever = False
    elif item.name == "batchScoring":
      consumer.batch_scoring = True
    elif item.name == "daemon":
      run_daemon = True
    elif data_input is not None:
      continue #Only process the first way that we are told to get the data.
    elif item.name == "fromFile" or item.name == "fromFifo":
      #No special treatment needed other than UniTable vs XML
      isUni = optional_attr(item, 'type') == "UniTable"
      data_input = Reader(consumer.score, source = item.attr['name'], logger = consumer.logger, magicheader = False, unitable = isUni, framing='EOF')
    elif item.name == "fromFixedRecordFile":
      #Each child describes one field; fields are laid out back to back,
      #so every field starts where the previous one ended.
      ffnames = []
      ffstarts = []
      ffends = []
      start = 0
      for field in item:
        ffnames.append(field.attr['name'])
        width = int(field.attr['length'])
        ffstarts.append(start)
        ffends.append(start + width)
        start += width
      ffCR = optional_attr(item, 'cr')
      data_input = Reader(consumer.score, source = item.attr['name'],
        types = None,
        logger = consumer.logger, magicheader = False, unitable = True, ffConvert = ffConfig(ffnames, ffstarts, ffends, ffCR))
    elif item.name == "fromCSVFile":
      #We have a CSV file that needs special treatment to read in correctly
      header = optional_attr(item, 'header')
      sep = optional_attr(item, 'sep')
      types = optional_attr(item, 'types')
      data_input = Reader(consumer.score, source = item.attr['name'], logger = consumer.logger, magicheader = False, unitable = True, header = header, sep = sep, types = types, framing = 'EOF')
    elif item.name == "fromStandardInput":
      sep = optional_attr(item, 'sep')
      types = optional_attr(item, 'types')
      isUni = optional_attr(item, 'type') == "UniTable"
      framing = optional_attr(item, 'framing', 'EOF')
      data_input = Reader(consumer.score, source = "-", logger = consumer.logger, magicheader = False, unitable = isUni, sep = sep, types = types, framing = framing)
    elif item.name == "fromHTTP":
      #get the stuff we need to setup the server
      input_url = item.attr['url']
      #A port passed to main takes precedence over the configured one.
      if port:
        input_port = int(port)
      else:
        input_port = int(item.attr['port'])
      if optional_attr(item, 'type') == "UniTable":
        callback = consumer.score_http_uni
      else:
        callback = consumer.score_http_xml
      
      #Create the server
      data_input = HTTPInterfaceServer(('',input_port), logger = consumer.logger)
      #Add the callback
      data_input.register_callback(input_url, callback)
    elif item.name == "eventBased":
      script_input = True
      data_input = False #Dummy value to get past a check for None later.
    else:
      #Not recognized
      consumer.logger.debug("Element %s is not a recognized child element of inputData, ignoring." % (item.name))
  
  #TODO: ??? What does the following comment refer to?
  #If summary data is being requested, set it up
  
  if data_input is None:
    #We made it through the config information without finding a data input source.
    raise ConfigurationError("Unable to determine data input source.")
  
  consumer.logger.debug("Initialize model")
  #Initialize the model
  #TODO: ??? What does the following comment refer to?
  #this is after the data information is input so that batch scoring may be faster
  consumer.initialize_model()
  
  if script_input:
    #Another script has called main, return the consumer so it can handle how score is called.
    return consumer
  
  consumer.logger.warning("Ready to score")
  #Start scoring data
  if consumer.metadata:
    # By default, for now, enable collection of
    # metadata by data reader and model (consumer general metadata
    # is enabled earlier).
    data_input.enableMetaDataCollection()
    consumer.model.enableMetaDataCollection()
  if consumer.batch_scoring:
    if consumer.metadata:
      consumer.metadata.log.info('Batch Scoring -One Score Per Segment\n')
    consumer.logger.debug("Batch Scoring")
    #Batch scoring only applies to Reader inputs; other inputs are left alone.
    if isinstance(data_input, Reader):
      data_input.read_once()
      report = consumer.format_results(consumer.model.batchScore())
      if consumer.output_filename:
        #consumer.out is not opened here; presumably it is set up while the
        # configuration is read -- verify against pmmlConsumer.
        try:
          consumer.output_report_header(file_handle = consumer.out)
          consumer.out.write(report)
          consumer.output_report_footer(file_handle = consumer.out)
        finally:
          #Close the output even when writing the report fails.
          consumer.out.close()
  elif run_forever:
    if consumer.metadata:
      consumer.metadata.log.info('Run Forever - One Score Per Event')
    consumer.logger.debug("Run Forever")
    if isinstance(data_input, Reader):
      consumer.output_report_header()
      data_input.read_forever()
      #NOTE(review): the other branches call output_report_footer() without
      # an argument -- confirm that passing consumer.out here is intended.
      consumer.output_report_footer(consumer.out)
    elif isinstance(data_input, HTTPServer):
      #presumably HTTPInterfaceServer derives from HTTPServer; verify against its definition
      data_input.serve_forever()
    else:
      consumer.logger.critical("Reading data failed.")
  else: #just read once
    while True:
      if consumer.metadata is not None:
        consumer.metadata.log.info('Run Once - One Score Per Event')
        consumer.metadata.log.info('Start at %s'%datetime.datetime.now().isoformat())
      consumer.logger.debug("Run Once")
      if isinstance(data_input, Reader):
        consumer.output_report_header()
        data_input.read_once()
        consumer.output_report_footer()
      elif isinstance(data_input, HTTPServer):
        data_input.handle_request()
      else:
        consumer.logger.critical("Reading data failed.")
      if consumer.metadata:
        consumer.metadata.log.info('End at %s'%datetime.datetime.now().isoformat())
      if not run_daemon:
        break
      #Daemon mode: sleep until a signal arrives, then make another pass.
      signal.signal(signal.SIGALRM, daemonRestartHandler)
      signal.signal(signal.SIGUSR1, daemonRestartHandler)
      signal.pause() # unix only
  if consumer.metadata:
    consumer.metadata['Stacksize after Scoring'] = ptools.stacksize()
    consumer.metadata['Resident Memory after Scoring'] = ptools.resident()/1e+9 #Gb
    consumer.metadata['Memory after Scoring'] = ptools.memory()/1e+9 #Gb
    consumer.metadata.collected['DataInput'] = data_input.getMetaData()
    consumer.metadata.collected['Scoring'] = consumer.getMetaData()
    consumer.metadata.collected[''] = consumer.model.getMetaData()
    consumer.metadata.report()
예제 #4
0
def main(config=None):
    """Configure a PMML consumer from a file and score its data source.

    Arguments:

        config (string or None; default None):
        Name of the configuration file.  When omitted (or otherwise
        false), the name is taken from the -c/--config command-line
        option, which defaults to "config.xml".

    The children of the configured data input information select the
    behaviour: "readOnce" makes a single pass instead of running forever,
    "batchScoring" turns on batch scoring, and the first of "fromFile",
    "fromFifo", "fromFixedRecordFile", "fromCSVFile", "fromStandardInput"
    or "fromHTTP" determines where the data comes from.  Any further data
    source is ignored and unrecognized elements are logged as warnings.

    Raises:

        ConfigurationError: if the configuration holds no data input
        information, or none of its children names a usable data source.
    """
    logging.basicConfig(level=logging.DEBUG)

    from optparse import OptionParser, make_option
    #define the options
    usage = "usage: %prog [options]"
    version = "%prog 0.3.3"
    options = [
        make_option("-c",
                    "--config",
                    metavar="config",
                    default="config.xml",
                    help="The configuration file name")
    ]
    parser = OptionParser(usage=usage, version=version, option_list=options)

    #parse the options (only consulted when no config name was passed in)
    if not config:
        (options, arguments) = parser.parse_args()
        config = options.config

    def optional_attr(element, key):
        """Return element.attr[key], or None when the attribute is absent."""
        if key in element.attr:
            return element.attr[key]
        return None

    #Take in a bunch of options describing where everything is
    consumer = pmmlConsumer()
    consumer.logger.debug("Create Reader to get Configuration")
    config_reader = Reader(consumer.configure,
                           source=str(config),
                           magicheader=False,
                           autoattr=False)
    consumer.logger.debug("Read Config File")
    config_reader.read_once()

    #Create any reader or http server to read in data
    data_input = None
    run_forever = True

    #Check to make sure that we don't try to iterate over None
    if consumer.data_input_info is None:
        raise ConfigurationError(
            "Data input source missing from configuration.")

    for item in consumer.data_input_info:
        if item.name == "readOnce":
            run_forever = False
        elif item.name == "batchScoring":
            consumer.batch_scoring = True
        elif data_input is not None:
            continue  #Only process the first way that we are told to get the data.
        elif item.name == "fromFile" or item.name == "fromFifo":
            #No special treatment needed other than UniTable vs XML
            isUni = optional_attr(item, 'type') == "UniTable"
            data_input = Reader(consumer.score,
                                source=item.attr['name'],
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=isUni)
        elif item.name == "fromFixedRecordFile":
            #Each child describes one field; fields are laid out back to
            #back, so every field starts where the previous one ended.
            ffnames = []
            ffstarts = []
            ffends = []
            start = 0
            for field in item:
                ffnames.append(field.attr['name'])
                width = int(field.attr['length'])
                ffstarts.append(start)
                ffends.append(start + width)
                start += width
            ffCR = optional_attr(item, 'cr')
            data_input = Reader(consumer.score,
                                source=item.attr['name'],
                                types=None,
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=True,
                                ffConvert=ffConfig(ffnames, ffstarts, ffends,
                                                   ffCR))
        elif item.name == "fromCSVFile":
            #We have a CSV file that needs special treatment to read in correctly
            header = optional_attr(item, 'header')
            sep = optional_attr(item, 'sep')
            types = optional_attr(item, 'types')
            data_input = Reader(consumer.score,
                                source=item.attr['name'],
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=True,
                                header=header,
                                sep=sep,
                                types=types)
        elif item.name == "fromStandardInput":
            isUni = optional_attr(item, 'type') == "UniTable"
            data_input = Reader(consumer.score,
                                source="-",
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=isUni)
        elif item.name == "fromHTTP":
            #get the stuff we need to setup the server
            input_url = item.attr['url']
            input_port = int(item.attr['port'])
            if optional_attr(item, 'type') == "UniTable":
                callback = consumer.score_http_uni
            else:
                callback = consumer.score_http_xml

            #Create the server
            data_input = HTTPInterfaceServer(('', input_port),
                                             logger=consumer.logger)
            #Add the callback
            data_input.register_callback(input_url, callback)
        else:
            #Not recognized
            consumer.logger.warning(
                "Element %s is not a recognized child element of inputData, ignoring."
                % (item.name))

    if data_input is None:
        raise ConfigurationError("Unable to determine data input source.")
    consumer.logger.debug("Initialize model")
    #Initialize the model
    #this is after the data information is input so that batch scoring may be faster
    consumer.initalize_model()
    consumer.logger.warning("Ready to score")
    #Start scoring data
    if consumer.batch_scoring:
        consumer.logger.debug("Batch Scoring")
        #Batch scoring only applies to Reader inputs; other inputs are left alone.
        if isinstance(data_input, Reader):
            data_input.read_once()
            report = consumer.format_results(consumer.model.batchScore())
            if consumer.output_filename:
                out = open(consumer.output_filename, 'w')
                try:
                    consumer.output_report_header(file_handle=out)
                    out.write(report)
                    consumer.output_report_footer(file_handle=out)
                finally:
                    #Release the handle even when writing the report fails.
                    out.close()
    elif run_forever:
        consumer.logger.debug("Run Forever")
        if isinstance(data_input, Reader):
            consumer.output_report_header()
            data_input.read_forever()
            consumer.output_report_footer()
        elif isinstance(data_input, HTTPServer):
            #presumably HTTPInterfaceServer derives from HTTPServer; verify against its definition
            data_input.serve_forever()
        else:
            #Call form prints the same text under Python 2 and also parses under Python 3.
            print("Reading data failed.")
    else:  #just read once
        consumer.logger.debug("Run Once")
        if isinstance(data_input, Reader):
            consumer.output_report_header()
            data_input.read_once()
            consumer.output_report_footer()
        elif isinstance(data_input, HTTPServer):
            data_input.handle_request()
        else:
            print("Reading data failed.")
예제 #5
0
class DataStreamer:
    """Contains a queue of data.  Its internal read function uses threading.

    Usage:

        dataStreamer = DataStreamer(config_options)
        # Get data_fields from the PMML model (assigned in this example).
        data_fields = {'field1':int, 'field2':str}

        # If reading from a source handle:
        dataStreamer.initialize()
        for record in dataStreamer:
            # process the record

        # Else if in interactive mode, add data using 'enqueue'.
        # (In a thread if the data must be buffered...)
        dataStreamer.enqueue(data) 

    Public methods:

        get, next, and __iter__

        enqueue(self, dictionary)
        initialize(self)  # Can only call this once.

    Internal methods:

        _read(self)
        _unitableCallback(self, uni_record)
        _xmlCallback(self, native_element)

    Data Members:

        _runOptions(NameSpace; contains initialization arguments)
        _logger (Logger)
        _queue (Queue): Queue of unprocessed, but read, data elements.
                        These should be either dictionaries or UniTables.
        _reader (Either None, a Reader, or HTTPInterfaceServer)
        _thread (Thread): Thread that will run the reader.
        _values (dictionary): The most recent row of data from _queue.
    """
    def __init__(
        self,
        fromHTTP=False, interactive=False, isXML=True, isCSV=False, runForever=False,
        maxsize=0, filename=None,
        **kwargs):
        """Set up the reading function and queue for the DataStreamer.

        DataStreamer's constructor is typically invoked by
        calling getDataStreamer(config_options), defined below.
        Error checking for appropriate configuration settings,
        and for sufficient contents in **kwargs is presumed to be
        done during XSD validation.  The reason this initialization
        function is separate is to allow an advanced user to call
        the streamer from a script and bypass having to make an
        XML object containing configuration settings.

        Arguments:

            fromHTTP (boolean; default False):
            If True, the reader will be an HTTPInterfaceServer.

            interactive (boolean; default False):
            If True, the reader will be None and the user will push
            data to the queue to score using self.enqueue(self, dictionary)
            in which dictionary is a dictionary or a UniRecord; a row in a
            UniTable.

            isXML (boolean; default False):
            If True, the reader will process the input stream as XML.

            runForever (boolean; default False):
            If True, run forever. Otherwise read all data and then exit.

            maxsize (integer; default 0):
            The maximum number of objects allowed in self.queue.
            If zero, the Queue can be arbitrarily long.

            **kwargs (arguments for the Reader)
        """
        self._runOptions =\
            NameSpace(
                fromHTTP=fromHTTP,
                interactive=interactive,
                isXML=isXML,
                runForever=runForever)
        self._fileList = filename  # None or else will become a list...
        self.currentFileNumber = 0
        self._logger = logging.getLogger()
        self._metadata = logging.getLogger('metadata')
        self._thread = None
        self._values = None
        self._queue = Queue.Queue(maxsize)
        callback = self._xmlCallback if isXML else self._unitableCallback

        if interactive:
            self._reader = None
        elif fromHTTP:
            def http_callback(data):
                wrapper = StringIO.StringIO(data)
                rdr =\
                    Reader(callback,
                        source=wrapper,
                        logger=self._logger,
                        magicheader=False,
                        unitable=not isXML,
                        wholeUniTable=not isXML)
                pipe = rdr.new_pipe()
                try:
                    result = rdr.feed_pipe(None, pipe)
                except:
                    raise IOError("Problem reading data over HTTP.")
                return result
    
            self._reader =\
                HTTPInterfaceServer(
                    ('', kwargs['port']), logger=logging.getLogger(''))
            self._reader.register_callback(kwargs['url'], http_callback)
            self._reader.isCSV = isCSV

        else:
            if filename == '-':
                self._fileList = ['-']
            else:
                import glob
                self._fileList = glob.glob(filename)
                self._fileList.sort()
                self._fileList.reverse()
            if len(self._fileList) == 0:
                raise RuntimeError, "No Data Input files matched %s" % filename

            self._reader = Reader(callback, unitable=not isXML, wholeUniTable=not isXML, **kwargs)
            self._reader.source = self._fileList.pop()
            self._reader.isCSV = isCSV

    def enqueue(self, dictionary):
        """Add the dictionary (or UniTable) to the queue.

        Arguments:

            dictionary format is {field1:value1, field2:value2}
        """
        try:
            self._queue.put(dictionary, timeout=0.1)
        except Queue.Full:
            self._logger.error("Data stream queue dropped:%s" % dictionary)

    def get(self, field):
        """Return the element's value or MISSING.
        
        Arguments:

            field (string): The name of a field in the dictionary/UniRecord

            * MISSING means the value is absent.
        """
        if self._values is None:
            return MISSING

        if field not in self._values.keys():
            self._logger.debug("Data not found for field: %s" % field)
            return MISSING

        output = self._values[field]

        if isinstance(output, float) and \
            (numpy.isnan(output) or numpy.isinf(output)):
            return INVALID

        return output

    def __iter__(self):
        return self

    def next(self):
        logDebug = self._logger.getEffectiveLevel() <= logging.DEBUG

        if self._runOptions.isXML:
            self._values = None
            # reset in order to get the next item
        elif self._values is not None:
            try:
                # Iterate over the UniTable
                self._values = self._values.next()
                if logDebug:
                    self._logger.debug("This record: %s" %self._values)
                return self._values
            except StopIteration:
                self._values = None
                # keep going; try to get the next UniTable

        try:
            self._values = self._queue.get(timeout=0.1)
            self._queue.task_done()
        except Queue.Empty:
            while self._values is None:
                if self._thread and self._thread.isAlive():
                    # If the Reader thread is still going, block until
                    # another result comes.
                    self._values = self._queue.get()
                    self._queue.task_done()
                    sleep(0)
                else:
                    # Otherwise reset my thread.
                    self._thread = None
                    # Step to the next file, if it exists.
                    if len(self._fileList) > 0:
                        self._reader.source = self._fileList.pop()
                        self.currentFileNumber += 1
                        self.initialize()
                    else:
                        raise StopIteration

        if not self._runOptions.isXML and self._values is not None:
            # Step into the UniTable
            self._values = self._values[0]
            if logDebug:
                self._logger.debug("This record: %s" %self._values)

            return self._values

        return self._values

    def _read(self):
        """The thread callback for enqueueing the data.

        When reading forever from a file handle (per the XSD,
        can only read forever FromHTTP or FromFifo) loop continuously
        around single reads.
        """
        if self._runOptions.interactive:
            self._logger.error(
                "DataStreamer._read is not for interactive mode.  "+\
                "Instead use DataStreamer.enqueue.")
            return

        if self._runOptions.fromHTTP:
            self._reader.serve_forever()
            return

        if self._runOptions.runForever:
            while True:
                try:
                    self._reader.read_forever()
                except KeyboardInterrupt:
                    self._logger.error("Keyboard Interrupt.")
                    raise
                except:
                    self._logger.error(
                        "error reading data: %s" % sys.exc_info()[0])
                sleep(0)
        else:
            self._metadata.startTiming('Time Reading Data')
            self._reader.read_once()
            self._metadata.stopTiming('Time Reading Data')

    def initialize(self):
        """Start receiving/reading information and posting to my queue.

        Launch a thread with target method to start reading from
        the data source.  The method depends on the run options set
        on initialization.
        """
        if self._runOptions.interactive:
            self._logger.error(
                "DataStreamer.initialize is not for interactive mode.  "+\
                "Instead use DataStreamer.enqueue.")
            return

        if self._thread:
            # Current implementation is to only read the source once.
            self._logger.error("DataStream streaming invoked again on the same source.")
            return

        self._thread = threading.Thread(target=self._read)
        # Don't quit Python until after the thread is finished running.
        self._thread.daemon = False
        self._thread.start()

    def _unitableCallback(self, uni_table):
        self.enqueue(uni_table)

    def _xmlCallback(self, native_element):
        for row in native_element:
            obj = dict([(str(k), row.attr[k]) for k in row.attr])
            self.enqueue(obj)