Beispiel #1
0
def getSwarmModelParams(modelID):
    """Fetch the Engine-level model params of a Swarm model.

    The generated description scripts stored for the model and for its job
    are written to a temporary directory as description.py and base.py,
    loaded through opfhelpers, and the resulting model description is
    serialized. The temporary directory is removed before returning, even
    when loading fails.

    Args:
      modelID - Engine-level model ID of the Swarm model

    Returns:
      JSON-encoded string of a dict with the keys "modelConfig" and
      "inferenceArgs" ("inferenceArgs" is null when the model control dict
      has no such entry)
    """

    # TODO: the use of opfhelpers.loadExperimentDescriptionScriptFromDir when
    #  retrieving module params results in a leakage of pf_base_descriptionNN
    #  and pf_descriptionNN module imports for every call to
    #  getSwarmModelParams, so the leakage is unlimited when
    #  getSwarmModelParams is called by a long-running process. This issue is
    #  presently being tracked by the JIRA:
    #  https://issues.numenta.org/browse/NPC-225. An alternate solution is to
    #  execute the guts of this function's logic in a separate process (via
    #  the multiprocessing module).

    jobsDAO = ClientJobsDAO.get()

    modelFields = jobsDAO.modelsGetFields(modelID,
                                          ["jobId", "genDescription"])
    jobID, description = modelFields

    (baseDescription, ) = jobsDAO.jobGetFields(jobID, ["genBaseDescription"])

    # opfhelpers loads an experiment from a directory holding base.py and
    # description.py, so materialize both scripts in a scratch directory.
    scratchDir = tempfile.mkdtemp()
    try:
        scripts = (("base.py", baseDescription),
                   ("description.py", description))
        for scriptName, scriptBody in scripts:
            scriptPath = os.path.join(scratchDir, scriptName)
            with open(scriptPath, mode="wb") as scriptFile:
                scriptFile.write(scriptBody)

        descriptionModule = opfhelpers.loadExperimentDescriptionScriptFromDir(
            scratchDir)
        expIface = opfhelpers.getExperimentDescriptionInterfaceFromModule(
            descriptionModule)

        modelConfig = expIface.getModelDescription()
        inferenceArgs = expIface.getModelControl().get("inferenceArgs", None)
        return json.dumps(
            dict(modelConfig=modelConfig, inferenceArgs=inferenceArgs))
    finally:
        # Best-effort cleanup; removal errors are deliberately ignored.
        shutil.rmtree(scratchDir, ignore_errors=True)
Beispiel #2
0
def getSwarmModelParams(modelID):
  """Return the Engine-level model params of a Swarm model as JSON.

  Looks up the model's generated description script and its job's generated
  base description script, writes them out as description.py / base.py in a
  temporary directory, and lets opfhelpers load the experiment from there.

  Args:
    modelID - Engine-level model ID of the Swarm model

  Returns:
    JSON-encoded string containing Model Params: a dict with the keys
    "modelConfig" and "inferenceArgs"
  """

  # TODO: the use of opfhelpers.loadExperimentDescriptionScriptFromDir when
  #  retrieving module params results in a leakage of pf_base_descriptionNN and
  #  pf_descriptionNN module imports for every call to getSwarmModelParams, so
  #  the leakage is unlimited when getSwarmModelParams is called by a
  #  long-running process such as grok-api-server. This issue is presently
  #  being tracked by the JIRA: https://issues.numenta.org/browse/NPC-225. An
  #  alternate solution is to execute the guts of this function's logic in a
  #  separate process (via the multiprocessing module).

  dao = ClientJobsDAO.get()

  jobID, description = dao.modelsGetFields(modelID,
                                           ["jobId", "genDescription"])
  baseDescription, = dao.jobGetFields(jobID, ["genBaseDescription"])

  tmpDir = tempfile.mkdtemp()

  def _writeScript(fileName, contents):
    # Store one generated script inside the temporary directory.
    with open(os.path.join(tmpDir, fileName), mode="wb") as out:
      out.write(contents)

  try:
    # base.py must exist next to description.py for the loader below.
    _writeScript("base.py", baseDescription)
    _writeScript("description.py", description)

    loadedModule = opfhelpers.loadExperimentDescriptionScriptFromDir(tmpDir)
    iface = opfhelpers.getExperimentDescriptionInterfaceFromModule(
      loadedModule)

    payload = dict(
      modelConfig=iface.getModelDescription(),
      inferenceArgs=iface.getModelControl().get("inferenceArgs", None))
    return json.dumps(payload)
  finally:
    # Always drop the temporary directory; failures to remove are ignored.
    shutil.rmtree(tmpDir, ignore_errors=True)
Beispiel #3
0
def createAndStartSwarm(client,
                        clientInfo="",
                        clientKey="",
                        params="",
                        minimumWorkers=None,
                        maximumWorkers=None,
                        alreadyRunning=False):
    """Insert a Hypersearch (swarm) job record and return jobInsert's result.

    Args:
      client - A string identifying the calling client. There is a small
          limit for the length of the value. See
          ClientJobsDAO.CLIENT_MAX_LEN.
      clientInfo - JSON encoded dict of client specific information.
      clientKey - Foreign key. Limited in length, see
          ClientJobsDAO._initTables.
      params - JSON encoded dict of the parameters for the job. This can be
          fetched out of the database by the worker processes based on the
          jobID.
      minimumWorkers - The minimum workers to allocate to the swarm. Set to
          None to use the configured default
          ("nupic.hypersearch.minWorkersPerSwarm").
      maximumWorkers - The maximum workers to allocate to the swarm. Set to
          None to use the configured default
          ("nupic.hypersearch.maxWorkersPerSwarm"). Set to 0 to use the
          maximum scheduler value.
      alreadyRunning - Insert a job record for an already running process.
          Used for testing.

    Returns:
      Whatever ClientJobsDAO.jobInsert() returns for the new job.
    """
    # Resolve worker limits, falling back to the configuration only when the
    # caller did not supply a value.
    minWorkers = minimumWorkers
    if minWorkers is None:
        minWorkers = Configuration.getInt(
            "nupic.hypersearch.minWorkersPerSwarm")

    maxWorkers = maximumWorkers
    if maxWorkers is None:
        maxWorkers = Configuration.getInt(
            "nupic.hypersearch.maxWorkersPerSwarm")

    jobsDAO = ClientJobsDAO.get()
    return jobsDAO.jobInsert(
        client=client,
        cmdLine="$HYPERSEARCH",
        clientInfo=clientInfo,
        clientKey=clientKey,
        alreadyRunning=alreadyRunning,
        params=params,
        minimumWorkers=minWorkers,
        maximumWorkers=maxWorkers,
        jobType=ClientJobsDAO.JOB_TYPE_HS)
def main(argv):
  """
  The main function of the HypersearchWorker script. This parses the command
  line arguments, instantiates a HypersearchWorker instance, and then
  runs it.

  Parameters:
  ----------------------------------------------------------------------
  argv:       full command line; argv[0] is skipped and argv[1:] is parsed
                as options. Positional arguments are not accepted.
  retval:     jobID of the job we ran. This is used by unit test code
                when calling this worker using the --params command
                line option (which tells this worker to insert the job
                itself).

  Raises RuntimeError when positional arguments are present, when both
  --jobID and --params are given, or when neither is given.
  """

  parser = OptionParser(helpString)

  parser.add_option("--jobID", action="store", type="int", default=None,
        help="jobID of the job within the dbTable [default: %default].")

  parser.add_option("--modelID", action="store", type="str", default=None,
        help=("Tell worker to re-run this model ID. When specified, jobID "
         "must also be specified [default: %default]."))

  parser.add_option("--workerID", action="store", type="str", default=None,
        help=("workerID of the scheduler's SlotAgent (GenericWorker) that "
          "hosts this SpecializedWorker [default: %default]."))

  parser.add_option("--params", action="store", default=None,
        help="Create and execute a new hypersearch request using this JSON " \
        "format params string. This is helpful for unit tests and debugging. " \
        "When specified jobID must NOT be specified. [default: %default].")

  parser.add_option("--clearModels", action="store_true", default=False,
        help="clear out the models table before starting [default: %default].")

  parser.add_option("--resetJobStatus", action="store_true", default=False,
        help="Reset the job status before starting  [default: %default].")

  parser.add_option("--logLevel", action="store", type="int", default=None,
        help="override default log level. Pass in an integer value that "
        "represents the desired logging level (10=logging.DEBUG, "
        "20=logging.INFO, etc.) [default: %default].")

  # Evaluate command line arguments; only options are accepted.
  (options, args) = parser.parse_args(argv[1:])
  if len(args) != 0:
    raise RuntimeError("Expected no command line arguments, but got: %s" % \
                        (args))

  # --jobID and --params are mutually exclusive.
  # NOTE(review): this is a truthiness test, so a jobID of 0 or an empty
  #  --params string would not be treated as "specified" here.
  if (options.jobID and options.params):
    raise RuntimeError("--jobID and --params can not be used at the same time")

  # ...but exactly one of them is required.
  if (options.jobID is None and options.params is None):
    raise RuntimeError("Either --jobID or --params must be specified.")

  initLogging(verbose=True)

  # Instantiate the HypersearchWorker and run it
  hst = HypersearchWorker(options, argv[1:])

  # Normal use. This is one among a number of workers. If we encounter
  #  an exception at the outer loop here, we fail the entire job.
  if options.params is None:
    try:
      jobID = hst.run()

    except Exception, e:
      # Fall back to the jobID from the command line, then build a completion
      #  message holding the error code, the exception repr and the traceback.
      jobID = options.jobID
      msg = StringIO.StringIO()
      print >>msg, "%s: Exception occurred in Hypersearch Worker: %r" % \
         (ErrorCodes.hypersearchLogicErr, e)
      traceback.print_exc(None, msg)

      completionReason = ClientJobsDAO.CMPL_REASON_ERROR
      completionMsg = msg.getvalue()
      hst.logger.error(completionMsg)

      # If no other worker already marked the job as failed, do so now.
      #  A workerCompletionReason that still reads CMPL_REASON_SUCCESS is
      #  taken to mean that no failure has been recorded yet; in that case
      #  the job is cancelled and the error reason/message are stored.
      jobsDAO = ClientJobsDAO.get()
      workerCmpReason = jobsDAO.jobGetFields(options.jobID,
          ['workerCompletionReason'])[0]
      if workerCmpReason == ClientJobsDAO.CMPL_REASON_SUCCESS:
        jobsDAO.jobSetFields(options.jobID, fields=dict(
            cancel=True,
            workerCompletionReason = ClientJobsDAO.CMPL_REASON_ERROR,
            workerCompletionMsg = completionMsg),
            useConnectionID=False,
            ignoreUnchanged=True)
  def run(self):
    """ Run this worker.

    Connects to the jobs database, optionally inserts a new job (when
    --params was given), builds the Hypersearch implementation for the job
    and then loops: pick (or create) a model, run it, repeat until the
    Hypersearch implementation signals that it is done. When --modelID was
    given, only that one model is run.

    Parameters:
    ----------------------------------------------------------------------
    retval:     jobID of the job we ran. This is used by unit test code
                  when calling this worker using the --params command
                  line option (which tells this worker to insert the job
                  itself).

    Raises RuntimeError when the job params request a Hypersearch
    implementation other than 'v2'.
    """
    # Easier access to options
    options = self._options

    # ---------------------------------------------------------------------
    # Connect to the jobs database
    self.logger.info("Connecting to the jobs database")
    cjDAO = ClientJobsDAO.get()

    # Get our worker ID (the connection ID reported by the DAO)
    self._workerID = cjDAO.getConnectionID()

    if options.clearModels:
      cjDAO.modelsClearAll()

    # -------------------------------------------------------------------------
    # if params were specified on the command line, insert a new job using
    #  them. The record is inserted as already running, with exactly one
    #  worker, and its jobID is stored back into the options.
    if options.params is not None:
      options.jobID = cjDAO.jobInsert(client='hwTest', cmdLine="echo 'test mode'",
                  params=options.params, alreadyRunning=True,
                  minimumWorkers=1, maximumWorkers=1,
                  jobType = cjDAO.JOB_TYPE_HS)
    # The worker ID shown in the log prefix is the one given on the command
    #  line when present, otherwise our own connection ID.
    if options.workerID is not None:
      wID = options.workerID
    else:
      wID = self._workerID

    buildID = Configuration.get('nupic.software.buildNumber', 'N/A')
    logPrefix = '<BUILDID=%s, WORKER=HW, WRKID=%s, JOBID=%s> ' % \
                (buildID, wID, options.jobID)
    ExtendedLogger.setLogPrefix(logPrefix)

    # ---------------------------------------------------------------------
    # Get the search parameters
    # If asked to reset the job status, do that now
    if options.resetJobStatus:
      cjDAO.jobSetFields(options.jobID,
           fields={'workerCompletionReason': ClientJobsDAO.CMPL_REASON_SUCCESS,
                   'cancel': False,
                   #'engWorkerState': None
                   },
           useConnectionID=False,
           ignoreUnchanged=True)
    jobInfo = cjDAO.jobInfo(options.jobID)
    self.logger.info("Job info retrieved: %s" % (str(clippedObj(jobInfo))))


    # ---------------------------------------------------------------------
    # Instantiate the Hypersearch object, which will handle the logic of
    #  which models to create when we need more to evaluate.
    jobParams = json.loads(jobInfo.params)

    # Validate job params against the JSON schema that ships next to this
    #  module (jsonschema/jobParamsSchema.json)
    jsonSchemaPath = os.path.join(os.path.dirname(__file__),
                                  "jsonschema",
                                  "jobParamsSchema.json")
    validate(jobParams, schemaPath=jsonSchemaPath)


    # Only the 'v2' implementation is supported; anything else (including a
    #  missing 'hsVersion' key) is rejected.
    hsVersion = jobParams.get('hsVersion', None)
    if hsVersion == 'v2':
      self._hs = HypersearchV2(searchParams=jobParams, workerID=self._workerID,
              cjDAO=cjDAO, jobID=options.jobID, logLevel=options.logLevel)
    else:
      raise RuntimeError("Invalid Hypersearch implementation (%s) specified" \
                          % (hsVersion))


    # =====================================================================
    # The main loop.
    try:
      # NOTE: the local name 'exit' shadows the builtin within this method.
      exit = False
      numModelsTotal = 0
      print >>sys.stderr, "reporter:status:Evaluating first model..."
      while not exit:

        # ------------------------------------------------------------------
        # Choose a model to evaluate
        batchSize = 10              # How many to try at a time.
        modelIDToRun = None
        while modelIDToRun is None:

          if options.modelID is None:
            # -----------------------------------------------------------------
            # Get the latest results on all running models and send them to
            #  the Hypersearch implementation
            # This calls cjDAO.modelsGetUpdateCounters(), compares the
            # updateCounters with what we have cached, fetches the results for the
            # changed and new models, and sends those to the Hypersearch
            # implementation's self._hs.recordModelProgress() method.
            self._processUpdatedModels(cjDAO)

            # --------------------------------------------------------------------
            # Create a new batch of models
            (exit, newModels) = self._hs.createModels(numModels = batchSize)
            if exit:
              break

            # No more models left to create, just loop. The _hs is waiting for
            #   all remaining running models to complete, and may pick up on an
            #  orphan if it detects one.
            if len(newModels) == 0:
              continue

            # Try and insert one that we will run. The first candidate that
            #  this worker manages to claim becomes modelIDToRun.
            for (modelParams, modelParamsHash, particleHash) in newModels:
              jsonModelParams = json.dumps(modelParams)
              (modelID, ours) = cjDAO.modelInsertAndStart(options.jobID,
                                  jsonModelParams, modelParamsHash, particleHash)

              # Some other worker is already running it, tell the Hypersearch object
              #  so that it doesn't try and insert it again
              if not ours:
                mParamsAndHash = cjDAO.modelsGetParams([modelID])[0]
                mResult = cjDAO.modelsGetResultAndStatus([modelID])[0]
                results = mResult.results
                if results is not None:
                  results = json.loads(results)

                # Note: modelParams and particleHash (the loop variables) are
                #  overwritten here with the values stored in the database.
                modelParams = json.loads(mParamsAndHash.params)
                particleHash = cjDAO.modelsGetFields(modelID, 
                                  ['engParticleHash'])[0]
                particleInst = "%s.%s" % (
                          modelParams['particleState']['id'],
                          modelParams['particleState']['genIdx'])
                self.logger.info("Adding model %d to our internal DB " \
                      "because modelInsertAndStart() failed to insert it: " \
                      "paramsHash=%s, particleHash=%s, particleId='%s'", modelID, 
                      mParamsAndHash.engParamsHash.encode('hex'),
                      particleHash.encode('hex'), particleInst)
                self._hs.recordModelProgress(modelID = modelID,
                      modelParams = modelParams,
                      modelParamsHash = mParamsAndHash.engParamsHash,
                      results = results,
                      completed = (mResult.status == cjDAO.STATUS_COMPLETED),
                      completionReason = mResult.completionReason,
                      matured = mResult.engMatured,
                      numRecords = mResult.numRecords)
              else:
                modelIDToRun = modelID
                break

          else:
            # A specific modelID was passed on the command line
            modelIDToRun = int(options.modelID)
            mParamsAndHash = cjDAO.modelsGetParams([modelIDToRun])[0]
            modelParams = json.loads(mParamsAndHash.params)
            modelParamsHash = mParamsAndHash.engParamsHash

            # Make us the worker
            cjDAO.modelSetFields(modelIDToRun,
                                     dict(engWorkerConnId=self._workerID))
            # NOTE: the branch below is disabled ('if False') and never runs.
            if False:
              # Change the hash and params of the old entry so that we can
              #  create a new model with the same params
              for attempt in range(1000):
                paramsHash = hashlib.md5("OrphanParams.%d.%d" % (modelIDToRun,
                                                                 attempt)).digest()
                particleHash = hashlib.md5("OrphanParticle.%d.%d" % (modelIDToRun,
                                                                  attempt)).digest()
                try:
                  cjDAO.modelSetFields(modelIDToRun,
                                           dict(engParamsHash=paramsHash,
                                                engParticleHash=particleHash))
                  success = True
                except:
                  success = False
                if success:
                  break
              if not success:
                raise RuntimeError("Unexpected failure to change paramsHash and "
                                   "particleHash of orphaned model")

              (modelIDToRun, ours) = cjDAO.modelInsertAndStart(options.jobID,
                                  mParamsAndHash.params, modelParamsHash)



            # ^^^ end while modelIDToRun ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

        # ---------------------------------------------------------------
        # We have a model, evaluate it now
        # All done? (createModels() asked us to stop, so there is no model)
        if exit:
          break

        # Run the model now
        self.logger.info("RUNNING MODEL GID=%d, paramsHash=%s, params=%s",
              modelIDToRun, modelParamsHash.encode('hex'), modelParams)

        # ---------------------------------------------------------------------
        # Construct model checkpoint GUID for this model:
        # jobParams['persistentJobGUID'] contains the client's (e.g., API Server)
        # persistent, globally-unique model identifier, which is what we need;
        # the GUID has the form <client>_<persistentJobGUID>_<modelID>.
        persistentJobGUID = jobParams['persistentJobGUID']
        assert persistentJobGUID, "persistentJobGUID: %r" % (persistentJobGUID,)

        modelCheckpointGUID = jobInfo.client + "_" + persistentJobGUID + (
          '_' + str(modelIDToRun))


        self._hs.runModel(modelID=modelIDToRun, jobID = options.jobID,
                          modelParams=modelParams, modelParamsHash=modelParamsHash,
                          jobsDAO=cjDAO, modelCheckpointGUID=modelCheckpointGUID)

        # TODO: don't increment for orphaned models
        numModelsTotal += 1

        self.logger.info("COMPLETED MODEL GID=%d; EVALUATED %d MODELs",
          modelIDToRun, numModelsTotal)
        print >>sys.stderr, "reporter:status:Evaluated %d models..." % \
                                    (numModelsTotal)
        print >>sys.stderr, "reporter:counter:HypersearchWorker,numModels,1"

        # A model requested via --modelID is run exactly once.
        if options.modelID is not None:
          exit = True
        # ^^^ end while not exit

    finally:
      # Provide Hypersearch instance an opportunity to clean up temporary files
      self._hs.close()

    self.logger.info("FINISHED. Evaluated %d models." % (numModelsTotal))
    print >>sys.stderr, "reporter:status:Finished, evaluated %d models" % (numModelsTotal)
    return options.jobID
  # 1 process: presumably the counterpart of the "options.params is None"
  #  check in main(), i.e. --params was supplied and this worker is the only
  #  one for the job, so it also records the job's completion itself.
  else:
    jobID = None
    completionReason = ClientJobsDAO.CMPL_REASON_SUCCESS
    completionMsg = "Success"

    try:
      jobID = hst.run()
    except Exception, e:
      # Remember the failure for the 'finally' block below, then let the
      #  exception propagate to the caller.
      jobID = hst._options.jobID
      completionReason = ClientJobsDAO.CMPL_REASON_ERROR
      completionMsg = "ERROR: %s" % (e,)
      raise
    finally:
      # Mark the job completed (success or error) whenever a jobID is known.
      if jobID is not None:
        cjDAO = ClientJobsDAO.get()
        cjDAO.jobSetCompleted(jobID=jobID,
                              completionReason=completionReason,
                              completionMsg=completionMsg)

  return jobID



if __name__ == "__main__":
  logging.setLoggerClass(ExtendedLogger)
  buildID = Configuration.get('nupic.software.buildNumber', 'N/A')
  logPrefix = '<BUILDID=%s, WORKER=HS, WRKID=N/A, JOBID=N/A> ' % buildID
  ExtendedLogger.setLogPrefix(logPrefix)
  
  try:
Beispiel #7
0
  def run(self):
    """ Run the dummy worker.

    Connects to the jobs database, optionally inserts a new job (when
    --params was given), then keeps itself busy for the duration requested
    by the job params and finally either returns or simulates a crash.

    The job params (JSON) must provide:
      'runTime' - seconds to keep running; -1 disables the time limit
      'load'    - 'heavy' for a CPU-bound workload, anything else sleeps
      'crash'   - when true, raise RuntimeError at the end

    Parameters:
    ----------------------------------------------------------------------
    retval:     jobID of the job we ran. This is used by unit test code
                  when calling this worker using the --params command
                  line option (which tells this worker to insert the job
                  itself).
    """
    opts = self._options

    # ---------------------------------------------------------------------
    # Connect to the jobs database and remember our connection ID
    self.logger.info("Connecting to the jobs database")
    jobsDAO = ClientJobsDAO.get()
    self._workerID = jobsDAO.getConnectionID()

    # ---------------------------------------------------------------------
    # Params on the command line mean: insert a new job built from them
    if opts.params is not None:
      opts.jobID = jobsDAO.jobInsert(
          client='dummy',
          cmdLine="python -m nupic.swarming.DummyWorker --jobID={JOBID}",
          params=opts.params)

    # ---------------------------------------------------------------------
    # Fetch the job record
    jobInfo = jobsDAO.jobInfo(opts.jobID)
    self.logger.info("Job info retrieved: %s" % (str(jobInfo)))

    # An explicitly supplied workerID wins over our own connection ID
    workerTag = self._workerID if opts.workerID is None else opts.workerID

    buildID = Configuration.get('nupic.software.buildNumber', 'N/A')
    ExtendedLogger.setLogPrefix(
        '<BUILDID=%s, WORKER=DW, WRKID=%s, JOBID=%s> '
        % (buildID, workerTag, opts.jobID))

    # ---------------------------------------------------------------------
    # Decode the job params that drive the dummy workload
    jobParams = json.loads(jobInfo.params)
    self.logger.info("Job Params: %s" % jobInfo.params)

    # prints the current status
    print >>sys.stderr, "reporter:status:Running dummy worker on job:%d" % \
                                                    (opts.jobID)

    self.logger.info("Start of the dummy worker")
    startTime = time.time()
    runTime = jobParams['runTime']
    jobLoad = jobParams['load']
    crashJob = jobParams['crash']

    try:
      while True:
        timedOut = runTime != -1 and time.time() > startTime + runTime
        if timedOut:
          break

        self.logger.info("In dummy worker")
        if jobLoad != 'heavy':
          time.sleep(0.8)
          continue

        # Computationally intensive process
        # Takes 0.8 sec approximately
        for _ in range(30000):
          numpy.random.rand(1000).sum()
    except:
      # NOTE(review): bare except -- every exception raised in the loop
      #  (including KeyboardInterrupt) is logged and then swallowed.
      self.logger.exception("DummyWorker exception;")

    if crashJob:
      self.logger.info("Crash of the dummy worker")
      print >>sys.stderr, "reporter:status:Crashed dummy worker..."
      raise RuntimeError("Simulating job crash.")

    self.logger.info("End of the dummy worker")
    print >>sys.stderr, "reporter:status:Finished dummy worker..."

    return opts.jobID