def initialize_model(self):
    """Parse the configured PMML source and select the model used for scoring.

    The first element of ``self.model_input_info`` names the model source;
    only ``fromFile`` and ``fromFifo`` elements can be read. The parsed
    document root is stored in ``self.myPMML`` and initialized with
    ``self.get_data`` and ``self.needed_cols``. The model itself is stored
    in ``self.model``.

    When ``self.metadata`` is not None, the parse time, the initialization
    time and the ``ptools`` stack/memory readings are recorded in it.

    Raises:
        ConfigurationError: if no model input is configured, or if the
            configured source type cannot be read.
    """
    self.logger.debug("Initializing")
    if not self.model_input_info:
        raise ConfigurationError("inputModel tag missing from configuration XML.")

    input_type = self.model_input_info[0]
    if input_type.name == "fromFile" or input_type.name == "fromFifo":
        model_source = input_type.attr['name']
    elif input_type.name == "fromHTTP":
        # Reading the model over HTTP was never implemented. Fail with a
        # configuration error here rather than reaching parse() with
        # model_source unbound.
        raise ConfigurationError("Reading the PMML model via fromHTTP is not supported.")
    else:
        raise ConfigurationError("Unable to determine model input source.")

    # Decide once whether metadata is collected so that both timers below
    # are guaranteed to exist whenever they are read.
    collect_metadata = self.metadata is not None

    self.logger.debug("Create reader for PMML")
    model_reader = pmmlReader()
    self.logger.debug("Parse PMML")
    if collect_metadata:
        parseStart = datetime.datetime.now()
    model_reader.parse(model_source, self.logger)

    if collect_metadata:
        self.logger.debug("Save PMML Parsing Time")
        self.metadata['Time Parsing PMML'] += datetime.datetime.now() - parseStart
        self.logger.debug("Calculate Resources")
        self.logger.debug("Save stacksize")
        self.metadata['Stacksize after Parsing PMML'] = ptools.stacksize()
        # The memory readings are scaled by 1e-9 (presumably bytes to
        # gigabytes; confirm against ptools).
        self.logger.debug("Save Resident Memory")
        self.metadata['Resident Memory after Parsing PMML'] = ptools.resident()/1e+9
        self.logger.debug("Save Memory after parsing PMML")
        self.metadata['Memory after Parsing PMML'] = ptools.memory()/1e+9
        self.logger.debug("Save User Time after Parsing PMML")
        if platform.system() not in ('Windows', 'Win', 'Microsoft'):
            # The resource module is imported only on non-Windows platforms.
            import resource
            resources = resource.getrusage(resource.RUSAGE_SELF)
            self.metadata['User Time after Parsing PMML'] = resources.ru_utime
        otherStart = datetime.datetime.now()

    self.myPMML = model_reader.root
    self.logger.debug("Initialize Model")
    self.myPMML.initialize(self.get_data, self.needed_cols)

    self.logger.debug("find model")
    if os.path.exists('pickledModel'):
        # Experimental: a 'pickledModel' file in the working directory
        # takes precedence over the model parsed from the PMML.
        # NOTE(review): unpickling executes arbitrary code; only run this
        # from a directory whose contents are trusted.
        pickled = open('pickledModel')
        try:
            self.model = cPickle.load(pickled)
        finally:
            pickled.close()
    else:
        self.model = self.myPMML.getChildrenOfType(pmmlModels)[0]

    # Disabled experiment: when switched on, writes the parsed PMML root to
    # 'pickledModel', the file the branch above looks for.
    if False:
        savedModel = model_reader.root
        self.logger.debug(dir(savedModel))
        pickle_out = open('pickledModel', 'w')
        try:
            cPickle.dump(savedModel, pickle_out)
        finally:
            pickle_out.close()

    if collect_metadata:
        self.metadata['Consumer Initialization'] += datetime.datetime.now() - otherStart
        self.metadata['Stacksize after Consumer Initialization'] = ptools.stacksize()
        self.metadata['Resident Memory after Consumer Initialization'] = ptools.resident()/1e+9
        self.metadata['Memory after Consumer Initialization'] = ptools.memory()/1e+9
def _optional_attr(element, key, default=None):
    """Return ``element.attr[key]`` if that attribute is present, else ``default``."""
    if key in element.attr:
        return element.attr[key]
    return default


def _file_reader(consumer, item):
    """Build the Reader for a ``fromFile`` or ``fromFifo`` element."""
    # No special treatment needed other than UniTable vs XML.
    is_unitable = _optional_attr(item, 'type') == "UniTable"
    return Reader(consumer.score,
                  source=item.attr['name'],
                  logger=consumer.logger,
                  magicheader=False,
                  unitable=is_unitable,
                  framing='EOF')


def _fixed_record_reader(consumer, item):
    """Build the Reader for a ``fromFixedRecordFile`` element.

    Each child of the element contributes one field; fields are laid out
    back to back, so a field starts where the previous one ended.
    """
    names = []
    starts = []
    ends = []
    offset = 0
    for field in item:
        names.append(field.attr['name'])
        width = int(field.attr['length'])
        starts.append(offset)
        offset += width
        ends.append(offset)
    return Reader(consumer.score,
                  source=item.attr['name'],
                  types=None,
                  logger=consumer.logger,
                  magicheader=False,
                  unitable=True,
                  ffConvert=ffConfig(names, starts, ends,
                                     _optional_attr(item, 'cr')))


def _csv_reader(consumer, item):
    """Build the Reader for a ``fromCSVFile`` element."""
    return Reader(consumer.score,
                  source=item.attr['name'],
                  logger=consumer.logger,
                  magicheader=False,
                  unitable=True,
                  header=_optional_attr(item, 'header'),
                  sep=_optional_attr(item, 'sep'),
                  types=_optional_attr(item, 'types'),
                  framing='EOF')


def _stdin_reader(consumer, item):
    """Build the Reader for a ``fromStandardInput`` element (source ``"-"``)."""
    is_unitable = _optional_attr(item, 'type') == "UniTable"
    consumer.logger.debug('...Test')
    return Reader(consumer.score,
                  source="-",
                  logger=consumer.logger,
                  magicheader=False,
                  unitable=is_unitable,
                  sep=_optional_attr(item, 'sep'),
                  types=_optional_attr(item, 'types'),
                  framing=_optional_attr(item, 'framing', 'EOF'))


def _http_server(consumer, item, port):
    """Build the HTTP server for a ``fromHTTP`` element and register its callback.

    A truthy ``port`` overrides the ``port`` attribute of the element.
    """
    input_url = item.attr['url']
    if port:
        input_port = int(port)
    else:
        input_port = int(item.attr['port'])
    if _optional_attr(item, 'type') == "UniTable":
        callback = consumer.score_http_uni
    else:
        callback = consumer.score_http_xml
    server = HTTPInterfaceServer(('', input_port), logger=consumer.logger)
    server.register_callback(input_url, callback)
    return server


def _open_data_input(consumer, item, port):
    """Return the data source described by ``item``, or None if it is not a source element."""
    kind = item.name
    if kind == "fromFile" or kind == "fromFifo":
        return _file_reader(consumer, item)
    if kind == "fromFixedRecordFile":
        return _fixed_record_reader(consumer, item)
    if kind == "fromCSVFile":
        return _csv_reader(consumer, item)
    if kind == "fromStandardInput":
        return _stdin_reader(consumer, item)
    if kind == "fromHTTP":
        return _http_server(consumer, item, port)
    return None


def _score_batch(consumer, data_input):
    """Read the input once, batch-score it and write the report if an output file is set."""
    if consumer.metadata:
        consumer.metadata.log.info('Batch Scoring -One Score Per Segment\n')
    consumer.logger.debug("Batch Scoring")
    if not isinstance(data_input, Reader):
        return
    data_input.read_once()
    report = consumer.format_results(consumer.model.batchScore())
    if consumer.output_filename:
        consumer.output_report_header(file_handle=consumer.out)
        consumer.out.write(report)
        consumer.output_report_footer(file_handle=consumer.out)
        consumer.out.close()


def _score_forever(consumer, data_input):
    """Score events from ``data_input`` until the reader or server stops."""
    if consumer.metadata:
        consumer.metadata.log.info('Run Forever - One Score Per Event')
    consumer.logger.debug("Run Forever")
    if isinstance(data_input, Reader):
        consumer.output_report_header()
        data_input.read_forever()
        consumer.output_report_footer(consumer.out)
    elif isinstance(data_input, HTTPServer):
        data_input.serve_forever()
    else:
        consumer.logger.critical("Reading data failed.")


def _score_once(consumer, data_input, run_daemon):
    """Score one pass over ``data_input``; in daemon mode, repeat after each signal."""
    while True:
        if consumer.metadata is not None:
            consumer.metadata.log.info('Run Once - One Score Per Event')
            consumer.metadata.log.info('Start at %s'%datetime.datetime.now().isoformat())
        consumer.logger.debug("Run Once")
        if isinstance(data_input, Reader):
            consumer.output_report_header()
            data_input.read_once()
            consumer.output_report_footer()
        elif isinstance(data_input, HTTPServer):
            data_input.handle_request()
        else:
            consumer.logger.critical("Reading data failed.")
        if consumer.metadata:
            consumer.metadata.log.info('End at %s'%datetime.datetime.now().isoformat())
        if not run_daemon:
            break
        # Daemon mode: block until a signal arrives, then run another pass.
        # The handlers are installed only after the first pass has finished.
        signal.signal(signal.SIGALRM, daemonRestartHandler)
        signal.signal(signal.SIGUSR1, daemonRestartHandler)
        signal.pause()  # unix only


def main(config, outfile=None, port=None):
    """Configure a PMML consumer, set up its data input and score the data.

    Args:
        config: Name of the configuration file; it is passed through
            ``str()`` and used as the source of the configuration reader.
        outfile: If truthy, replaces the output filename taken from the
            configuration file.
        port: If truthy, replaces the port configured on a ``fromHTTP``
            input element.

    Returns:
        The consumer when the configuration contains an ``eventBased``
        element, so that the calling script can drive scoring itself;
        otherwise None, after scoring has finished.

    Raises:
        ConfigurationError: if the configuration has no data input section
            or none of its elements describes a usable data source.
    """
    consumer = pmmlConsumer()
    # Do not log before the configuration has been read: the logging
    # handler is set up while reading the config file, so earlier messages
    # only produce 'No handlers could be found for logger "consumer"'.
    config_reader = Reader(consumer.configure, source=str(config),
                           magicheader=False, autoattr=False)
    config_reader.read_once()

    # The command line option wins over the out file from the config file.
    if outfile:
        consumer.output_filename = outfile

    # Make sure that we don't try to iterate over None.
    if consumer.data_input_info is None:
        raise ConfigurationError("Data input source missing from configuration.")

    data_input = None
    run_forever = True
    run_daemon = False
    script_input = False
    for item in consumer.data_input_info:
        if item.name == "readOnce":
            run_forever = False
        elif item.name == "batchScoring":
            consumer.batch_scoring = True
        elif item.name == "daemon":
            run_daemon = True
        elif data_input is not None:
            # Only the first data source in the configuration is used.
            continue
        elif item.name == "eventBased":
            script_input = True
            data_input = False  # Dummy value to get past the check for None below.
        else:
            data_input = _open_data_input(consumer, item, port)
            if data_input is None:
                consumer.logger.debug("Element %s is not a recognized child element of inputData, ignoring." % (item.name))

    if data_input is None:
        # We made it through the config information without finding a data input source.
        raise ConfigurationError("Unable to determine data input source.")

    # The model is initialized only after the data input has been set up
    # (an original note says this is so that batch scoring may be faster).
    consumer.logger.debug("Initialize model")
    consumer.initialize_model()

    if script_input:
        # Another script has called main; hand back the consumer so that
        # script decides how score is called.
        return consumer

    consumer.logger.warning("Ready to score")
    if consumer.metadata:
        # By default, for now, enable collection of metadata by the data
        # reader and the model (consumer general metadata is enabled earlier).
        data_input.enableMetaDataCollection()
        consumer.model.enableMetaDataCollection()

    if consumer.batch_scoring:
        _score_batch(consumer, data_input)
    elif run_forever:
        _score_forever(consumer, data_input)
    else:
        _score_once(consumer, data_input, run_daemon)

    if consumer.metadata:
        consumer.metadata['Stacksize after Scoring'] = ptools.stacksize()
        consumer.metadata['Resident Memory after Scoring'] = ptools.resident()/1e+9  # Gb
        consumer.metadata['Memory after Scoring'] = ptools.memory()/1e+9  # Gb
        consumer.metadata.collected['DataInput'] = data_input.getMetaData()
        consumer.metadata.collected['Scoring'] = consumer.getMetaData()
        consumer.metadata.collected[''] = consumer.model.getMetaData()
        consumer.metadata.report()