def getSwarmModelParams(modelID):
  """Retrieve the Engine-level model params from a Swarm model.

  Args:
    modelID - Engine-level model ID of the Swarm model

  Returns:
    JSON-encoded string containing Model Params
  """
  # TODO: the use of opfhelpers.loadExperimentDescriptionScriptFromDir when
  #  retrieving model params results in a leakage of pf_base_descriptionNN and
  #  pf_descriptionNN module imports for every call to getSwarmModelParams, so
  #  the leakage is unbounded when getSwarmModelParams is called by a
  #  long-running process such as grok-api-server. This issue is presently
  #  being tracked by the JIRA: https://issues.numenta.org/browse/NPC-225. An
  #  alternate solution is to execute the guts of this function's logic in a
  #  separate process (via the multiprocessing module).
  cjDAO = ClientJobsDAO.get()

  (jobID, description) = cjDAO.modelsGetFields(
      modelID,
      ["jobId", "genDescription"])

  (baseDescription,) = cjDAO.jobGetFields(jobID, ["genBaseDescription"])

  # Construct a directory with base.py and description.py for loading model
  # params, and use opfhelpers to extract model params from those files
  descriptionDirectory = tempfile.mkdtemp()
  try:
    baseDescriptionFilePath = os.path.join(descriptionDirectory, "base.py")
    with open(baseDescriptionFilePath, mode="wb") as f:
      f.write(baseDescription)

    descriptionFilePath = os.path.join(descriptionDirectory, "description.py")
    with open(descriptionFilePath, mode="wb") as f:
      f.write(description)

    expIface = opfhelpers.getExperimentDescriptionInterfaceFromModule(
        opfhelpers.loadExperimentDescriptionScriptFromDir(
            descriptionDirectory))

    return json.dumps(
        dict(
            modelConfig=expIface.getModelDescription(),
            inferenceArgs=expIface.getModelControl().get("inferenceArgs",
                                                         None)))
  finally:
    shutil.rmtree(descriptionDirectory, ignore_errors=True)
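# A minimal sketch of the alternate solution mentioned in the TODO above:
# do the extraction in a throwaway child process so that the leaked
# pf_base_descriptionNN / pf_descriptionNN imports die with the child
# instead of accumulating in a long-running parent. This assumes the body
# of getSwarmModelParams is factored into a hypothetical module-level
# helper _getSwarmModelParamsImpl(modelID); multiprocessing requires a
# module-level callable so that it can be pickled.
def getSwarmModelParamsIsolated(modelID):
  import multiprocessing

  pool = multiprocessing.Pool(processes=1)
  try:
    # apply() blocks until the child returns the JSON-encoded model params
    return pool.apply(_getSwarmModelParamsImpl, (modelID,))
  finally:
    pool.close()
    pool.join()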
def createAndStartSwarm(client, clientInfo="", clientKey="", params="",
                        minimumWorkers=None, maximumWorkers=None,
                        alreadyRunning=False):
  """Create and start a swarm job.

  Args:
    client - A string identifying the calling client. There is a small limit
        for the length of the value. See ClientJobsDAO.CLIENT_MAX_LEN.
    clientInfo - JSON-encoded dict of client-specific information.
    clientKey - Foreign key. Limited in length, see ClientJobsDAO._initTables.
    params - JSON-encoded dict of the parameters for the job. This can be
        fetched out of the database by the worker processes based on the
        jobID.
    minimumWorkers - The minimum workers to allocate to the swarm. Set to None
        to use the default.
    maximumWorkers - The maximum workers to allocate to the swarm. Set to None
        to use the swarm default. Set to 0 to use the maximum scheduler value.
    alreadyRunning - Insert a job record for an already-running process. Used
        for testing.
  """
  if minimumWorkers is None:
    minimumWorkers = Configuration.getInt(
        "nupic.hypersearch.minWorkersPerSwarm")
  if maximumWorkers is None:
    maximumWorkers = Configuration.getInt(
        "nupic.hypersearch.maxWorkersPerSwarm")

  return ClientJobsDAO.get().jobInsert(
      client=client,
      cmdLine="$HYPERSEARCH",
      clientInfo=clientInfo,
      clientKey=clientKey,
      alreadyRunning=alreadyRunning,
      params=params,
      minimumWorkers=minimumWorkers,
      maximumWorkers=maximumWorkers,
      jobType=ClientJobsDAO.JOB_TYPE_HS)
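# A minimal usage sketch of createAndStartSwarm. The params payload here is
# illustrative only; real swarm jobs carry a full JSON description of the
# stream, encoders, and inference settings, and "TST" is a hypothetical
# client code (see ClientJobsDAO.CLIENT_MAX_LEN for the length limit).
def _exampleCreateAndStartSwarm():
  swarmParams = json.dumps({
      "hsVersion": "v2",
      "persistentJobGUID": "example-guid-0001",
  })
  jobID = createAndStartSwarm(client="TST",
                              clientInfo=json.dumps({}),
                              params=swarmParams,
                              minimumWorkers=1,
                              maximumWorkers=4)
  return jobID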
def main(argv):
  """
  The main function of the HypersearchWorker script. This parses the command
  line arguments, instantiates a HypersearchWorker instance, and then
  runs it.

  Parameters:
  ----------------------------------------------------------------------
  retval:     jobID of the job we ran. This is used by unit test code
                when calling this worker using the --params command
                line option (which tells this worker to insert the job
                itself).
  """
  parser = OptionParser(helpString)

  parser.add_option("--jobID", action="store", type="int", default=None,
        help="jobID of the job within the dbTable [default: %default].")

  parser.add_option("--modelID", action="store", type="str", default=None,
        help=("Tell worker to re-run this model ID. When specified, jobID "
              "must also be specified [default: %default]."))

  parser.add_option("--workerID", action="store", type="str", default=None,
        help=("workerID of the scheduler's SlotAgent (GenericWorker) that "
              "hosts this SpecializedWorker [default: %default]."))

  parser.add_option("--params", action="store", default=None,
        help="Create and execute a new hypersearch request using this JSON "
             "format params string. This is helpful for unit tests and "
             "debugging. When specified, jobID must NOT be specified "
             "[default: %default].")

  parser.add_option("--clearModels", action="store_true", default=False,
        help="Clear out the models table before starting "
             "[default: %default].")

  parser.add_option("--resetJobStatus", action="store_true", default=False,
        help="Reset the job status before starting [default: %default].")

  parser.add_option("--logLevel", action="store", type="int", default=None,
        help="Override the default log level. Pass in an integer value that "
             "represents the desired logging level (10=logging.DEBUG, "
             "20=logging.INFO, etc.) [default: %default].")

  # Evaluate command line arguments
  (options, args) = parser.parse_args(argv[1:])
  if len(args) != 0:
    raise RuntimeError("Expected no command line arguments, but got: %s" %
                       (args,))

  if options.jobID and options.params:
    raise RuntimeError("--jobID and --params cannot be used at the same time")

  if options.jobID is None and options.params is None:
    raise RuntimeError("Either --jobID or --params must be specified.")

  initLogging(verbose=True)

  # Instantiate the HypersearchWorker and run it
  hst = HypersearchWorker(options, argv[1:])

  # Normal use. This is one among a number of workers. If we encounter an
  # exception at the outer loop here, we fail the entire job.
  if options.params is None:
    try:
      jobID = hst.run()

    except Exception, e:
      jobID = options.jobID
      msg = StringIO.StringIO()
      print >>msg, "%s: Exception occurred in Hypersearch Worker: %r" % (
          ErrorCodes.hypersearchLogicErr, e)
      traceback.print_exc(None, msg)

      completionReason = ClientJobsDAO.CMPL_REASON_ERROR
      completionMsg = msg.getvalue()
      hst.logger.error(completionMsg)

      # If no other worker already marked the job as failed, do so now.
      jobsDAO = ClientJobsDAO.get()
      workerCmpReason = jobsDAO.jobGetFields(
          options.jobID, ["workerCompletionReason"])[0]
      if workerCmpReason == ClientJobsDAO.CMPL_REASON_SUCCESS:
        jobsDAO.jobSetFields(
            options.jobID,
            fields=dict(
                cancel=True,
                workerCompletionReason=ClientJobsDAO.CMPL_REASON_ERROR,
                workerCompletionMsg=completionMsg),
            useConnectionID=False,
            ignoreUnchanged=True)
def run(self):
  """ Run this worker.

  Parameters:
  ----------------------------------------------------------------------
  retval:     jobID of the job we ran. This is used by unit test code
                when calling this worker using the --params command
                line option (which tells this worker to insert the job
                itself).
  """
  # Easier access to options
  options = self._options

  # ---------------------------------------------------------------------
  # Connect to the jobs database
  self.logger.info("Connecting to the jobs database")
  cjDAO = ClientJobsDAO.get()

  # Get our worker ID
  self._workerID = cjDAO.getConnectionID()

  if options.clearModels:
    cjDAO.modelsClearAll()

  # -------------------------------------------------------------------------
  # if params were specified on the command line, insert a new job using
  # them.
  if options.params is not None:
    options.jobID = cjDAO.jobInsert(client="hwTest",
                                    cmdLine="echo 'test mode'",
                                    params=options.params,
                                    alreadyRunning=True,
                                    minimumWorkers=1,
                                    maximumWorkers=1,
                                    jobType=cjDAO.JOB_TYPE_HS)

  if options.workerID is not None:
    wID = options.workerID
  else:
    wID = self._workerID

  buildID = Configuration.get("nupic.software.buildNumber", "N/A")
  logPrefix = "<BUILDID=%s, WORKER=HW, WRKID=%s, JOBID=%s> " % (
      buildID, wID, options.jobID)
  ExtendedLogger.setLogPrefix(logPrefix)

  # ---------------------------------------------------------------------
  # Get the search parameters

  # If asked to reset the job status, do that now
  if options.resetJobStatus:
    cjDAO.jobSetFields(
        options.jobID,
        fields={"workerCompletionReason": ClientJobsDAO.CMPL_REASON_SUCCESS,
                "cancel": False,
                #'engWorkerState': None
               },
        useConnectionID=False,
        ignoreUnchanged=True)

  jobInfo = cjDAO.jobInfo(options.jobID)
  self.logger.info("Job info retrieved: %s" % (str(clippedObj(jobInfo))))

  # ---------------------------------------------------------------------
  # Instantiate the Hypersearch object, which will handle the logic of
  # which models to create when we need more to evaluate.
  jobParams = json.loads(jobInfo.params)

  # Validate job params
  jsonSchemaPath = os.path.join(os.path.dirname(__file__),
                                "jsonschema",
                                "jobParamsSchema.json")
  validate(jobParams, schemaPath=jsonSchemaPath)

  hsVersion = jobParams.get("hsVersion", None)
  if hsVersion == "v2":
    self._hs = HypersearchV2(searchParams=jobParams,
                             workerID=self._workerID,
                             cjDAO=cjDAO,
                             jobID=options.jobID,
                             logLevel=options.logLevel)
  else:
    raise RuntimeError("Invalid Hypersearch implementation (%s) specified" %
                       (hsVersion,))

  # =====================================================================
  # The main loop.
  try:
    exit = False
    numModelsTotal = 0
    print >>sys.stderr, "reporter:status:Evaluating first model..."
    while not exit:

      # ------------------------------------------------------------------
      # Choose a model to evaluate
      batchSize = 10              # How many to try at a time.
      modelIDToRun = None
      while modelIDToRun is None:

        if options.modelID is None:
          # ---------------------------------------------------------------
          # Get the latest results on all running models and send them to
          # the Hypersearch implementation.
          # This calls cjDAO.modelsGetUpdateCounters(), compares the
          # updateCounters with what we have cached, fetches the results
          # for the changed and new models, and sends those to the
          # Hypersearch implementation's self._hs.recordModelProgress()
          # method.
          self._processUpdatedModels(cjDAO)

          # ---------------------------------------------------------------
          # Create a new batch of models
          (exit, newModels) = self._hs.createModels(numModels=batchSize)
          if exit:
            break

          # No more models left to create, just loop. The _hs is waiting
          # for all remaining running models to complete, and may pick up
          # on an orphan if it detects one.
          if len(newModels) == 0:
            continue

          # Try to insert one that we will run
          for (modelParams, modelParamsHash, particleHash) in newModels:
            jsonModelParams = json.dumps(modelParams)
            (modelID, ours) = cjDAO.modelInsertAndStart(options.jobID,
                jsonModelParams, modelParamsHash, particleHash)

            # Some other worker is already running it; tell the Hypersearch
            # object so that it doesn't try to insert it again
            if not ours:
              mParamsAndHash = cjDAO.modelsGetParams([modelID])[0]
              mResult = cjDAO.modelsGetResultAndStatus([modelID])[0]
              results = mResult.results
              if results is not None:
                results = json.loads(results)

              modelParams = json.loads(mParamsAndHash.params)
              particleHash = cjDAO.modelsGetFields(modelID,
                  ["engParticleHash"])[0]
              particleInst = "%s.%s" % (
                  modelParams["particleState"]["id"],
                  modelParams["particleState"]["genIdx"])
              self.logger.info("Adding model %d to our internal DB "
                  "because modelInsertAndStart() failed to insert it: "
                  "paramsHash=%s, particleHash=%s, particleId='%s'",
                  modelID,
                  mParamsAndHash.engParamsHash.encode("hex"),
                  particleHash.encode("hex"),
                  particleInst)
              self._hs.recordModelProgress(
                  modelID=modelID,
                  modelParams=modelParams,
                  modelParamsHash=mParamsAndHash.engParamsHash,
                  results=results,
                  completed=(mResult.status == cjDAO.STATUS_COMPLETED),
                  completionReason=mResult.completionReason,
                  matured=mResult.engMatured,
                  numRecords=mResult.numRecords)
            else:
              modelIDToRun = modelID
              break

        else:
          # A specific modelID was passed on the command line
          modelIDToRun = int(options.modelID)
          mParamsAndHash = cjDAO.modelsGetParams([modelIDToRun])[0]
          modelParams = json.loads(mParamsAndHash.params)
          modelParamsHash = mParamsAndHash.engParamsHash

          # Make us the worker
          cjDAO.modelSetFields(modelIDToRun,
                               dict(engWorkerConnId=self._workerID))
          if False:
            # Change the hash and params of the old entry so that we can
            # create a new model with the same params
            for attempt in range(1000):
              paramsHash = hashlib.md5("OrphanParams.%d.%d" %
                                       (modelIDToRun, attempt)).digest()
              particleHash = hashlib.md5("OrphanParticle.%d.%d" %
                                         (modelIDToRun, attempt)).digest()
              try:
                cjDAO.modelSetFields(modelIDToRun,
                                     dict(engParamsHash=paramsHash,
                                          engParticleHash=particleHash))
                success = True
              except:
                success = False
              if success:
                break

            if not success:
              raise RuntimeError("Unexpected failure to change paramsHash "
                                 "and particleHash of orphaned model")

            (modelIDToRun, ours) = cjDAO.modelInsertAndStart(options.jobID,
                mParamsAndHash.params, modelParamsHash)

        # ^^^ end while modelIDToRun ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

      # ---------------------------------------------------------------
      # We have a model, evaluate it now

      # All done?
      if exit:
        break

      # Run the model now
      self.logger.info("RUNNING MODEL GID=%d, paramsHash=%s, params=%s",
                       modelIDToRun, modelParamsHash.encode("hex"),
                       modelParams)

      # -----------------------------------------------------------------
      # Construct model checkpoint GUID for this model:
      # jobParams['persistentJobGUID'] contains the client's (e.g., API
      # Server) persistent, globally-unique model identifier, which is
      # what we need;
      persistentJobGUID = jobParams["persistentJobGUID"]
      assert persistentJobGUID, "persistentJobGUID: %r" % (
          persistentJobGUID,)

      modelCheckpointGUID = (jobInfo.client + "_" + persistentJobGUID +
                             "_" + str(modelIDToRun))

      self._hs.runModel(modelID=modelIDToRun,
                        jobID=options.jobID,
                        modelParams=modelParams,
                        modelParamsHash=modelParamsHash,
                        jobsDAO=cjDAO,
                        modelCheckpointGUID=modelCheckpointGUID)

      # TODO: don't increment for orphaned models
      numModelsTotal += 1

      self.logger.info("COMPLETED MODEL GID=%d; EVALUATED %d MODELs",
                       modelIDToRun, numModelsTotal)
      print >>sys.stderr, "reporter:status:Evaluated %d models..." % (
          numModelsTotal)
      print >>sys.stderr, "reporter:counter:HypersearchWorker,numModels,1"

      if options.modelID is not None:
        exit = True
      # ^^^ end while not exit

  finally:
    # Provide the Hypersearch instance an opportunity to clean up
    # temporary files
    self._hs.close()

  self.logger.info("FINISHED. Evaluated %d models." % (numModelsTotal))
  print >>sys.stderr, "reporter:status:Finished, evaluated %d models" % (
      numModelsTotal)
  return options.jobID
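# A concrete illustration of two details from run() above, using
# illustrative values only: the hsVersion dispatch accepts just "v2", and
# the model checkpoint GUID has the form
# "<client>_<persistentJobGUID>_<modelID>".
def _exampleRunInternals():
  jobParams = json.loads('{"hsVersion": "v2", "persistentJobGUID": "a1b2c3"}')
  assert jobParams.get("hsVersion", None) == "v2"

  jobClient = "GRP"    # hypothetical client code from the job record
  modelIDToRun = 42    # hypothetical Engine-level model ID
  modelCheckpointGUID = (jobClient + "_" + jobParams["persistentJobGUID"] +
                         "_" + str(modelIDToRun))
  assert modelCheckpointGUID == "GRP_a1b2c3_42"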
  # Run in just this one process (used by unit tests that pass --params so
  # that the worker inserts the job itself).
  else:
    jobID = None
    completionReason = ClientJobsDAO.CMPL_REASON_SUCCESS
    completionMsg = "Success"

    try:
      jobID = hst.run()
    except Exception, e:
      jobID = hst._options.jobID
      completionReason = ClientJobsDAO.CMPL_REASON_ERROR
      completionMsg = "ERROR: %s" % (e,)
      raise
    finally:
      if jobID is not None:
        cjDAO = ClientJobsDAO.get()
        cjDAO.jobSetCompleted(jobID=jobID,
                              completionReason=completionReason,
                              completionMsg=completionMsg)

  return jobID



if __name__ == "__main__":
  logging.setLoggerClass(ExtendedLogger)
  buildID = Configuration.get("nupic.software.buildNumber", "N/A")
  logPrefix = "<BUILDID=%s, WORKER=HS, WRKID=N/A, JOBID=N/A> " % buildID
  ExtendedLogger.setLogPrefix(logPrefix)

  try:
    main(sys.argv)
  except:
    logging.exception("HypersearchWorker is exiting with unhandled exception; "
                      "argv=%r", sys.argv)
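# For reference, a unit test can drive main() directly by passing --params
# so that the worker inserts its own job. The payload below is illustrative
# only; the real required fields are enforced by jobParamsSchema.json.
def _exampleRunWorkerInTestMode():
  testParams = json.dumps({"hsVersion": "v2",
                           "persistentJobGUID": "test-guid-0001"})
  # argv[0] is the script name; main() parses everything after it
  return main(["HypersearchWorker", "--params", testParams])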
def run(self):
  """ Run this worker.

  Parameters:
  ----------------------------------------------------------------------
  retval:     jobID of the job we ran. This is used by unit test code
                when calling this worker using the --params command
                line option (which tells this worker to insert the job
                itself).
  """
  # Easier access to options
  options = self._options

  # ---------------------------------------------------------------------
  # Connect to the jobs database
  self.logger.info("Connecting to the jobs database")
  cjDAO = ClientJobsDAO.get()

  # Get our worker ID
  self._workerID = cjDAO.getConnectionID()

  # -------------------------------------------------------------------------
  # if params were specified on the command line, insert a new job using
  # them.
  if options.params is not None:
    options.jobID = cjDAO.jobInsert(
        client="dummy",
        cmdLine="python -m nupic.swarming.DummyWorker --jobID={JOBID}",
        params=options.params)

  # ---------------------------------------------------------------------
  # Get the search parameters
  jobInfo = cjDAO.jobInfo(options.jobID)
  self.logger.info("Job info retrieved: %s" % (str(jobInfo)))

  if options.workerID is not None:
    wID = options.workerID
  else:
    wID = self._workerID

  buildID = Configuration.get("nupic.software.buildNumber", "N/A")
  logPrefix = "<BUILDID=%s, WORKER=DW, WRKID=%s, JOBID=%s> " % (
      buildID, wID, options.jobID)
  ExtendedLogger.setLogPrefix(logPrefix)

  # ---------------------------------------------------------------------
  # Instantiate the Dummy object, which will handle the logic of
  # which models to create when we need more to evaluate.
  jobParams = json.loads(jobInfo.params)
  self.logger.info("Job Params: %s" % jobInfo.params)

  # Report the current status
  print >>sys.stderr, "reporter:status:Running dummy worker on job:%d" % (
      options.jobID)

  self.logger.info("Start of the dummy worker")
  startTime = time.time()
  runTime = jobParams["runTime"]
  jobLoad = jobParams["load"]
  crashJob = jobParams["crash"]

  try:
    while True:
      if runTime != -1 and time.time() > startTime + runTime:
        break
      self.logger.info("In dummy worker")
      if jobLoad == "heavy":
        # Computationally intensive work; takes approximately 0.8 sec
        numIterations = 30000
        for i in range(numIterations):
          d = numpy.random.rand(1000).sum()
      else:
        time.sleep(0.8)
  except:
    self.logger.exception("DummyWorker exception;")

  if crashJob:
    self.logger.info("Crash of the dummy worker")
    print >>sys.stderr, "reporter:status:Crashed dummy worker..."
    raise RuntimeError("Simulating job crash.")
  else:
    self.logger.info("End of the dummy worker")
    print >>sys.stderr, "reporter:status:Finished dummy worker..."

  return options.jobID
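# A sketch of the params payload this dummy worker expects, based on the
# keys read above; the values are illustrative.
def _exampleDummyWorkerParams():
  return json.dumps({
      "runTime": 10,     # seconds to run; -1 means run until cancelled
      "load": "heavy",   # "heavy" busy-loops on numpy; anything else sleeps
      "crash": False,    # True simulates a job crash at the end of the run
  })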