コード例 #1
0
 def score_http_xml(self, data):
   """HTTP server callback for request bodies that are expected to be XML.

   The raw payload is wrapped in a file-like StringIO object and pushed
   through a Reader (created with unitable=False) whose callback is
   self.score.  The value produced by Reader.feed_pipe is returned
   unchanged; exceptions are not intercepted here.
   """
   # The Reader wants a file-like source, not a plain string.
   payload = StringIO(data)
   xml_reader = Reader(
     self.score,
     source=payload,
     logger=self.logger,
     magicheader=False,
     unitable=False)
   xml_pipe = xml_reader.new_pipe()
   return xml_reader.feed_pipe(None, xml_pipe)
コード例 #2
0
 def score_http_uni(self, data):
   """HTTP server callback for request bodies that are expected to be a UniTable or csv file.

   The payload is wrapped in a StringIO object and fed through a Reader
   created with unitable=True, whose callback is self.score.

   Returns the value of Reader.feed_pipe on success.  If feeding the pipe
   raises an Exception, the failure is logged together with its traceback
   and None is returned instead of propagating the error.
   KeyboardInterrupt and SystemExit are deliberately not intercepted.
   """
   wrapper = StringIO(data) #wrap the data in a StringIO object
   rdr = Reader(self.score, source = wrapper, logger = self.logger, magicheader = False, unitable = True)
   pipe = rdr.new_pipe()
   try:
     return rdr.feed_pipe(None, pipe)
   except Exception:
     # Best effort: report why this payload failed rather than hiding it.
     self.logger.exception("something broke.")
     return None
コード例 #3
0
 def http_callback(data):
     """Feed one HTTP payload through a Reader and return the result.

     `callback`, `isXML` and `self` come from the enclosing scope.  The
     payload is wrapped in a StringIO object; the Reader is created in
     UniTable mode (unitable and wholeUniTable both True) exactly when
     isXML is false.

     Returns whatever Reader.feed_pipe returns.

     Raises:
         IOError: if feeding the pipe raises an Exception.  The original
         error is logged with its traceback first, because it is replaced
         rather than chained.
     """
     wrapper = StringIO.StringIO(data)
     rdr =\
         Reader(callback,
             source=wrapper,
             logger=self._logger,
             magicheader=False,
             unitable=not isXML,
             wholeUniTable=not isXML)
     pipe = rdr.new_pipe()
     try:
         result = rdr.feed_pipe(None, pipe)
     except Exception:
         # Only ordinary errors are translated; KeyboardInterrupt and
         # SystemExit pass through untouched.
         self._logger.exception("Problem reading data over HTTP.")
         raise IOError("Problem reading data over HTTP.")
     return result
コード例 #4
0
    def __init__(self,
                 fromHTTP=False,
                 interactive=False,
                 isXML=True,
                 isCSV=False,
                 runForever=False,
                 maxsize=0,
                 filename=None,
                 **kwargs):
        """Set up the reading function and queue for the DataStreamer.

        DataStreamer's constructor is typically invoked by
        calling getDataStreamer(config_options), defined below.
        Error checking for appropriate configuration settings,
        and for sufficient contents in **kwargs is presumed to be
        done during XSD validation.  The reason this initialization
        function is separate is to allow an advanced user to call
        the streamer from a script and bypass having to make an
        XML object containing configuration settings.

        Arguments:

            fromHTTP (boolean; default False):
            If True (and interactive is False), the reader will be an
            HTTPInterfaceServer listening on kwargs['port'] with a
            callback registered for kwargs['url'].

            interactive (boolean; default False):
            If True, the reader will be None and the user will push
            data to the queue to score using self.enqueue(self, dictionary)
            in which dictionary is a dictionary or a UniRecord; a row in a
            UniTable.  Takes precedence over fromHTTP.

            isXML (boolean; default True):
            If True, the reader will process the input stream as XML;
            otherwise the reader is created in UniTable mode.

            isCSV (boolean; default False):
            Stored as the isCSV attribute of the reader (HTTP and file
            modes only).

            runForever (boolean; default False):
            If True, run forever. Otherwise read all data and then exit.

            maxsize (integer; default 0):
            The maximum number of objects allowed in self.queue.
            If zero, the Queue can be arbitrarily long.

            filename (string; default None):
            Only used in file mode (neither interactive nor fromHTTP).
            '-' is passed to the reader as-is; anything else is treated
            as a glob pattern and the matching files are read in sorted
            order.

            **kwargs (arguments for the Reader)

        Raises:

            RuntimeError: in file mode, if filename is None or matches
            no files.
        """
        self._runOptions =\
            NameSpace(
                fromHTTP=fromHTTP,
                interactive=interactive,
                isXML=isXML,
                runForever=runForever)
        self._fileList = filename  # None or else will become a list...
        self.currentFileNumber = 0
        self._logger = logging.getLogger()
        self._metadata = logging.getLogger('metadata')
        self._thread = None
        self._values = None
        self._queue = Queue.Queue(maxsize)
        callback = self._xmlCallback if isXML else self._unitableCallback

        if interactive:
            self._reader = None
        elif fromHTTP:

            def http_callback(data):
                """Feed one HTTP payload through a Reader; IOError on failure."""
                wrapper = StringIO.StringIO(data)
                rdr =\
                    Reader(callback,
                        source=wrapper,
                        logger=self._logger,
                        magicheader=False,
                        unitable=not isXML,
                        wholeUniTable=not isXML)
                pipe = rdr.new_pipe()
                try:
                    result = rdr.feed_pipe(None, pipe)
                except Exception:
                    # The original error is replaced, not chained, so log
                    # it first.  KeyboardInterrupt/SystemExit pass through.
                    self._logger.exception("Problem reading data over HTTP.")
                    raise IOError("Problem reading data over HTTP.")
                return result

            self._reader =\
                HTTPInterfaceServer(
                    ('', kwargs['port']), logger=logging.getLogger(''))
            self._reader.register_callback(kwargs['url'], http_callback)
            self._reader.isCSV = isCSV

        else:
            if filename == '-':
                self._fileList = ['-']
            else:
                import glob
                # A missing filename is reported like a pattern that
                # matched nothing instead of failing inside glob.
                matches = glob.glob(filename) if filename is not None else []
                # Descending order, so that pop() below hands out the
                # files in ascending order.
                matches.sort(reverse=True)
                self._fileList = matches
            if not self._fileList:
                raise RuntimeError(
                    "No Data Input files matched %s" % filename)

            self._reader = Reader(callback,
                                  unitable=not isXML,
                                  wholeUniTable=not isXML,
                                  **kwargs)
            self._reader.source = self._fileList.pop()
            self._reader.isCSV = isCSV
コード例 #5
0
ファイル: pmmlConsumer.py プロジェクト: soedjais/augustus
def main(config=None):
    """Configure a pmmlConsumer and score data from its configured input.

    Arguments:

        config (default None):
        Name of the configuration file (converted with str()).  When it
        is not given, the -c/--config command line option is used, which
        defaults to "config.xml".

    The first recognized input element in the configuration (fromFile,
    fromFifo, fromFixedRecordFile, fromCSVFile, fromStandardInput or
    fromHTTP) decides where data comes from; later ones are skipped.
    "readOnce" and "batchScoring" elements switch the run mode.

    Raises:

        ConfigurationError: if the configuration carries no data input
        information, or none of its elements yields an input source.
    """
    logging.basicConfig(level=logging.DEBUG)

    #The command line is only consulted when no config was passed in.
    if not config:
        from optparse import OptionParser, make_option
        usage = "usage: %prog [options]"
        version = "%prog 0.3.3"
        option_list = [
            make_option("-c",
                        "--config",
                        metavar="config",
                        default="config.xml",
                        help="The configuration file name")
        ]
        parser = OptionParser(usage=usage,
                              version=version,
                              option_list=option_list)
        (options, arguments) = parser.parse_args()
        config = options.config

    #Take in a bunch of options describing where everything is
    consumer = pmmlConsumer()
    consumer.logger.debug("Create Reader to get Configuration")
    config_reader = Reader(consumer.configure,
                           source=str(config),
                           magicheader=False,
                           autoattr=False)
    consumer.logger.debug("Read Config File")
    config_reader.read_once()

    #Create any reader or http server to read in data
    data_input = None
    run_forever = True

    #Check to make sure that we don't try to iterate over None
    if consumer.data_input_info is None:
        raise ConfigurationError(
            "Data input source missing from configuration.")

    for item in consumer.data_input_info:
        if item.name == "readOnce":
            run_forever = False
        elif item.name == "batchScoring":
            consumer.batch_scoring = True
        elif data_input is not None:
            continue  #Only process the first way that we are told to get the data.
        elif item.name == "fromFile" or item.name == "fromFifo":
            #No special treatment needed other than UniTable vs XML
            isUni = ('type' in item.attr
                     and item.attr['type'] == "UniTable")
            data_input = Reader(consumer.score,
                                source=item.attr['name'],
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=isUni)
        elif item.name == "fromFixedRecordFile":
            #Fields are laid out back to back: each one starts where the
            #previous one ended.
            ffnames = []
            ffstarts = []
            ffends = []
            start = 0
            for field in item:
                length = int(field.attr['length'])
                ffnames.append(field.attr['name'])
                ffstarts.append(start)
                ffends.append(start + length)
                start += length
            ffCR = item.attr['cr'] if 'cr' in item.attr else None
            data_input = Reader(consumer.score,
                                source=item.attr['name'],
                                types=None,
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=True,
                                ffConvert=ffConfig(ffnames, ffstarts, ffends,
                                                   ffCR))
        elif item.name == "fromCSVFile":
            #We have a CSV file that needs special treatment to read in correctly
            header = item.attr['header'] if 'header' in item.attr else None
            sep = item.attr['sep'] if 'sep' in item.attr else None
            types = item.attr['types'] if 'types' in item.attr else None
            data_input = Reader(consumer.score,
                                source=item.attr['name'],
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=True,
                                header=header,
                                sep=sep,
                                types=types)
        elif item.name == "fromStandardInput":
            isUni = ('type' in item.attr
                     and item.attr['type'] == "UniTable")
            data_input = Reader(consumer.score,
                                source="-",
                                logger=consumer.logger,
                                magicheader=False,
                                unitable=isUni)
        elif item.name == "fromHTTP":
            #get the stuff we need to setup the server
            input_url = item.attr['url']
            input_port = int(item.attr['port'])
            if 'type' in item.attr and item.attr['type'] == "UniTable":
                callback = consumer.score_http_uni
            else:
                callback = consumer.score_http_xml

            #Create the server
            data_input = HTTPInterfaceServer(('', input_port),
                                             logger=consumer.logger)
            #Add the callback
            data_input.register_callback(input_url, callback)
        else:
            #Not recognized
            consumer.logger.warning(
                "Element %s is not a recognized child element of inputData, ignoring."
                % (item.name))

    if data_input is None:
        raise ConfigurationError("Unable to determine data input source.")
    consumer.logger.debug("Initialize model")
    #Initalize the model
    #this is after the data information is input so that batch scoring may be faster
    # NOTE(review): the method is spelled 'initalize_model' here; confirm
    # this matches the name defined on pmmlConsumer.
    consumer.initalize_model()
    consumer.logger.warning("Ready to score")
    #Start scoring data
    if consumer.batch_scoring:
        consumer.logger.debug("Batch Scoring")
        #Batch scoring only acts on Reader inputs; other inputs are ignored.
        if isinstance(data_input, Reader):
            data_input.read_once()
            report = consumer.format_results(consumer.model.batchScore())
            if consumer.output_filename:
                out = open(consumer.output_filename, 'w')
                try:
                    consumer.output_report_header(file_handle=out)
                    out.write(report)
                    consumer.output_report_footer(file_handle=out)
                finally:
                    #Close the file even if writing the report fails.
                    out.close()
    elif run_forever:
        consumer.logger.debug("Run Forever")
        if isinstance(data_input, Reader):
            consumer.output_report_header()
            data_input.read_forever()
            consumer.output_report_footer()
        elif isinstance(data_input, HTTPServer):
            data_input.serve_forever()
        else:
            print("Reading data failed.")
    else:  #just read once
        consumer.logger.debug("Run Once")
        if isinstance(data_input, Reader):
            consumer.output_report_header()
            data_input.read_once()
            consumer.output_report_footer()
        elif isinstance(data_input, HTTPServer):
            data_input.handle_request()
        else:
            print("Reading data failed.")
コード例 #6
0
def main(config, outfile=None, port=None):
  """Configure a pmmlConsumer and score data from its configured input.

  Arguments:

    config:
    Name of the configuration file (converted with str()).

    outfile (default None):
    If given, replaces consumer.output_filename after the configuration
    has been read.

    port (default None):
    If given, used as the port of a fromHTTP input instead of the 'port'
    attribute from the configuration.

  The first recognized input element in the configuration decides where
  data comes from; later ones are skipped.  "readOnce", "batchScoring"
  and "daemon" elements switch the run mode.

  Returns the consumer, without scoring anything, when the input element
  selected is "eventBased"; otherwise returns None after scoring.

  Raises:

    ConfigurationError: if the configuration carries no data input
    information, or none of its elements yields an input source.
  """
  #Read in a config file with a bunch of options describing where everything is
  consumer = pmmlConsumer()
  #Nothing is logged before the config file has been read: per the original
  # author the logging handler is set up while reading it, so earlier calls
  # only produce 'No handlers could be found for logger "consumer"'.
  config_reader = Reader(consumer.configure, source = str(config), magicheader = False, autoattr = False)
  config_reader.read_once()

  #Overwrite the out file from the config file with the command line option if it was present.
  if outfile:
    consumer.output_filename = outfile
  #Create any reader or http server to read in data
  data_input = None
  run_forever = True
  run_daemon = False
  script_input = False
  
  #Check to make sure that we don't try to iterate over None
  if consumer.data_input_info is None:
    raise ConfigurationError("Data input source missing from configuration.")
  
  for item in consumer.data_input_info:
    if item.name == "readOnce":
      run_forever = False
    elif item.name == "batchScoring":
      consumer.batch_scoring = True
    elif item.name == "daemon":
      run_daemon = True
    elif data_input is not None:
      continue #Only process the first way that we are told to get the data.
    elif item.name == "fromFile" or item.name == "fromFifo":
      #No special treatment needed other than UniTable vs XML
      isUni = 'type' in item.attr and item.attr['type'] == "UniTable"
      data_input = Reader(consumer.score, source = item.attr['name'], logger = consumer.logger, magicheader = False, unitable = isUni, framing='EOF')
    elif item.name == "fromFixedRecordFile":
      #Fields are laid out back to back: each one starts where the
      # previous one ended.
      ffnames = []
      ffstarts = []
      ffends = []
      start = 0
      for field in item:
        length = int(field.attr['length'])
        ffnames.append(field.attr['name'])
        ffstarts.append(start)
        ffends.append(start + length)
        start += length
      ffCR = item.attr['cr'] if 'cr' in item.attr else None
      data_input = Reader(consumer.score, source = item.attr['name'],
        types = None,
        logger = consumer.logger, magicheader = False, unitable = True, ffConvert = ffConfig(ffnames, ffstarts, ffends, ffCR))
    elif item.name == "fromCSVFile":
      #We have a CSV file that needs special treatment to read in correctly
      header = item.attr['header'] if 'header' in item.attr else None
      sep = item.attr['sep'] if 'sep' in item.attr else None
      types = item.attr['types'] if 'types' in item.attr else None
      data_input = Reader(consumer.score, source = item.attr['name'], logger = consumer.logger, magicheader = False, unitable = True, header = header, sep = sep, types = types, framing = 'EOF')
    elif item.name == "fromStandardInput":
      sep = item.attr['sep'] if 'sep' in item.attr else None
      types = item.attr['types'] if 'types' in item.attr else None
      isUni = 'type' in item.attr and item.attr['type'] == "UniTable"
      framing = item.attr['framing'] if 'framing' in item.attr else 'EOF'
      consumer.logger.debug('...Test')
      data_input = Reader(consumer.score, source = "-", logger = consumer.logger, magicheader = False, unitable = isUni, sep = sep, types = types, framing = framing)
    elif item.name == "fromHTTP":
      #get the stuff we need to setup the server
      input_url = item.attr['url']
      #A port passed by the caller wins over the configured one.
      if port:
        input_port = int(port)
      else:
        input_port = int(item.attr['port'])
      if 'type' in item.attr and item.attr['type'] == "UniTable":
        callback = consumer.score_http_uni
      else:
        callback = consumer.score_http_xml
      
      #Create the server
      data_input = HTTPInterfaceServer(('',input_port), logger = consumer.logger)
      #Add the callback
      data_input.register_callback(input_url, callback)
    elif item.name == "eventBased":
      script_input = True
      data_input = False #Dummy value to get past a check for None later.
    else:
      #Not recognized
      consumer.logger.debug("Element %s is not a recognized child element of inputData, ignoring." % (item.name))
  
  #TODO: ??? What does the following comment refer to?
  #If summary data is being requested, set it up
  
  if data_input is None:
    #We made it through the config information without finding a data input source.
    raise ConfigurationError("Unable to determine data input source.")
  
  consumer.logger.debug("Initialize model")
  #Initialize the model
  #TODO: ??? What does the following comment refer to?
  #this is after the data information is input so that batch scoring may be faster
  consumer.initialize_model()
  
  if script_input:
    #Another script has called main, return the consumer so it can handle how score is called.
    return consumer
  
  consumer.logger.warning("Ready to score")
  #Start scoring data
  if consumer.metadata:
    # By default, for now, enable collection of
    # metadata by data reader and model (consumer general metadata
    # is enabled earlier).
    data_input.enableMetaDataCollection()
    consumer.model.enableMetaDataCollection()
  if consumer.batch_scoring:
    if consumer.metadata:
      consumer.metadata.log.info('Batch Scoring -One Score Per Segment\n')
    consumer.logger.debug("Batch Scoring")
    #Batch scoring only acts on Reader inputs; other inputs are ignored.
    if isinstance(data_input, Reader):
      data_input.read_once()
      report = consumer.format_results(consumer.model.batchScore())
      if consumer.output_filename:
        consumer.output_report_header(file_handle = consumer.out)
        consumer.out.write(report)
        consumer.output_report_footer(file_handle = consumer.out)
        consumer.out.close()
  elif run_forever:
    if consumer.metadata:
      consumer.metadata.log.info('Run Forever - One Score Per Event')
    consumer.logger.debug("Run Forever")
    if isinstance(data_input, Reader):
      consumer.output_report_header()
      data_input.read_forever()
      consumer.output_report_footer(consumer.out)
    elif isinstance(data_input, HTTPServer):
      data_input.serve_forever()
    else:
      consumer.logger.critical("Reading data failed.")
  else: #just read once
    #Without the daemon flag the loop body runs exactly once; with it, the
    # process sleeps until a signal arrives and then reads again.
    while True:
      # NOTE(review): this test is 'is not None' while the other metadata
      # checks in this function use plain truthiness - confirm which one
      # is intended.
      if consumer.metadata is not None:
        consumer.metadata.log.info('Run Once - One Score Per Event')
        consumer.metadata.log.info('Start at %s'%datetime.datetime.now().isoformat())
      consumer.logger.debug("Run Once")
      if isinstance(data_input, Reader):
        consumer.output_report_header()
        data_input.read_once()
        consumer.output_report_footer()
      elif isinstance(data_input, HTTPServer):
        data_input.handle_request()
      else:
        consumer.logger.critical("Reading data failed.")
      if consumer.metadata:
        consumer.metadata.log.info('End at %s'%datetime.datetime.now().isoformat())
      if not run_daemon:
        break
      signal.signal(signal.SIGALRM, daemonRestartHandler)
      signal.signal(signal.SIGUSR1, daemonRestartHandler)
      signal.pause() # unix only
  if consumer.metadata:
    consumer.metadata['Stacksize after Scoring'] = ptools.stacksize()
    consumer.metadata['Resident Memory after Scoring'] = ptools.resident()/1e+9 #Gb
    consumer.metadata['Memory after Scoring'] = ptools.memory()/1e+9 #Gb
    consumer.metadata.collected['DataInput'] = data_input.getMetaData()
    consumer.metadata.collected['Scoring'] = consumer.getMetaData()
    consumer.metadata.collected[''] = consumer.model.getMetaData()
    consumer.metadata.report()