Example #1
0
def validateOpfJsonValue(value, opfJsonSchemaFilename):
    """Check a python object against one of the OPF json schema files.

    value:    python object to validate (typically a dictionary)

    opfJsonSchemaFilename: name of the OPF json schema file; it is looked up
                    inside the "jsonschema" directory that sits next to this
                    module (e.g., opfTaskControlSchema.json)

    Returns: nothing

    Raises: jsonhelpers.ValidationError when value fails json validation
    """
    # The schema files live in a "jsonschema" directory beside this module
    schemaRoot = os.path.join(os.path.dirname(__file__), "jsonschema")
    schemaFile = os.path.join(schemaRoot, opfJsonSchemaFilename)

    # Hand the object and the resolved schema location to the validator
    jsonhelpers.validate(value, schemaPath=schemaFile)
Example #2
0
def validateOpfJsonValue(value, opfJsonSchemaFilename):
  """ Validate a python object against an OPF json schema file

  value:    python object to validate (typically a dictionary)

  opfJsonSchemaFilename: filename of the OPF json schema, resolved against
                  the "jsonschema" directory located alongside this module
                  (e.g., opfTaskControlSchema.json)

  Returns: nothing

  Raises: jsonhelpers.ValidationError when value fails json validation
  """
  # Directory holding this module; the schemas are stored beneath it
  moduleDir = os.path.dirname(__file__)

  jsonhelpers.validate(
    value,
    schemaPath=os.path.join(moduleDir, "jsonschema", opfJsonSchemaFilename))
Example #3
0
def _runExperimentImpl(options, model=None):
    """Creates and runs the experiment

  Args:
    options: namedtuple ParseCommandLineOptionsResult
    model: For testing: may pass in an existing OPF Model instance
        to use instead of creating a new one.

  Returns: referece to OPFExperiment instance that was constructed (this
      is provided to aid with debugging) or None, if none was
      created.
  """
    jsonhelpers.validate(options.privateOptions, schemaDict=g_parsedPrivateCommandLineOptionsSchema)

    # Load the experiment's description.py module
    experimentDir = options.experimentDir
    descriptionPyModule = opfhelpers.loadExperimentDescriptionScriptFromDir(experimentDir)
    expIface = opfhelpers.getExperimentDescriptionInterfaceFromModule(descriptionPyModule)

    # Handle "list checkpoints" request
    if options.privateOptions["listAvailableCheckpoints"]:
        _printAvailableCheckpoints(experimentDir)
        return None

    # Load experiment tasks
    experimentTasks = expIface.getModelControl().get("tasks", [])

    # If the tasks list is empty, and this is a nupic environment description
    # file being run from the OPF, convert it to a simple OPF description file.
    if len(experimentTasks) == 0 and expIface.getModelControl()["environment"] == OpfEnvironment.Nupic:
        expIface.convertNupicEnvToOPF()
        experimentTasks = expIface.getModelControl().get("tasks", [])

    # Handle listTasks
    if options.privateOptions["listTasks"]:
        print "Available tasks:"

        for label in [t["taskLabel"] for t in experimentTasks]:
            print "\t", label

        return None

    # Construct the experiment instance
    if options.privateOptions["runCheckpointName"]:

        assert model is None

        checkpointName = options.privateOptions["runCheckpointName"]

        model = ModelFactory.loadFromCheckpoint(savedModelDir=_getModelCheckpointDir(experimentDir, checkpointName))

    elif model is not None:
        print "Skipping creation of OPFExperiment instance: caller provided his own"
    else:
        modelDescription = expIface.getModelDescription()
        model = ModelFactory.create(modelDescription)

    # Handle "create model" request
    if options.privateOptions["createCheckpointName"]:
        checkpointName = options.privateOptions["createCheckpointName"]
        _saveModel(model=model, experimentDir=experimentDir, checkpointLabel=checkpointName)

        return model

    # Build the task list

    # Default task execution index list is in the natural list order of the tasks
    taskIndexList = range(len(experimentTasks))

    customTaskExecutionLabelsList = options.privateOptions["taskLabels"]
    if customTaskExecutionLabelsList:
        taskLabelsList = [t["taskLabel"] for t in experimentTasks]
        taskLabelsSet = set(taskLabelsList)

        customTaskExecutionLabelsSet = set(customTaskExecutionLabelsList)

        assert customTaskExecutionLabelsSet.issubset(taskLabelsSet), (
            "Some custom-provided task execution labels don't correspond "
            "to actual task labels: mismatched labels: %r; actual task "
            "labels: %r."
        ) % (customTaskExecutionLabelsSet - taskLabelsSet, customTaskExecutionLabelsList)

        taskIndexList = [taskLabelsList.index(label) for label in customTaskExecutionLabelsList]

        print "#### Executing custom task list: %r" % [taskLabelsList[i] for i in taskIndexList]

    # Run all experiment tasks
    for taskIndex in taskIndexList:

        task = experimentTasks[taskIndex]

        # Create a task runner and run it!
        taskRunner = _TaskRunner(model=model, task=task, cmdOptions=options)
        taskRunner.run()
        del taskRunner

        if options.privateOptions["checkpointModel"]:
            _saveModel(model=model, experimentDir=experimentDir, checkpointLabel=task["taskLabel"])

    return model
Example #4
0
  def __init__(self, streamDef, bookmark=None, saveOutput=False,
               isBlocking=True, maxTimeout=0, eofOnTimeout=False):
    """ Base class constructor, performs common initialization

    Parameters:
    ----------------------------------------------------------------
    streamDef:  The stream definition, potentially containing multiple sources
                (not supported yet). See
                /nupic/frameworks/opf/jsonschema/stream_def.json for the format
                of this dict

    bookmark: Bookmark to start reading from. This overrides the first_record
                field of the streamDef if provided.

    saveOutput: If true, save the output to a csv file in a temp directory.
                The path to the generated file can be found in the log
                output.

    isBlocking: should read operation block *forever* if the next row of data
                is not available, but the stream is not marked as 'completed'
                yet?

    maxTimeout: if isBlocking is False, max seconds to wait for more data before
                timing out; ignored when isBlocking is True.

    eofOnTimeout: If True and we get a read timeout (isBlocking must be False
                to get read timeouts), assume we've reached the end of the
                input and produce the last aggregated record, if one can be
                completed.

    Raises:     RuntimeError if a column named in the stream definition is not
                present in the underlying record store.
    """

    # Call superclass constructor
    super(StreamReader, self).__init__()

    loggerPrefix = 'com.numenta.nupic.data.StreamReader'
    self._logger = logging.getLogger(loggerPrefix)
    jsonhelpers.validate(streamDef,
                         schemaPath=pkg_resources.resource_filename(
                             jsonschema.__name__, "stream_def.json"))
    assert len(streamDef['streams']) == 1, "Only 1 source stream is supported"

    # Save constructor args
    sourceDict = streamDef['streams'][0]
    self._recordCount = 0
    self._eofOnTimeout = eofOnTimeout
    self._logger.debug('Reading stream with the def: %s', sourceDict)

    # Dictionary to store record statistics (min and max of scalars for now)
    self._stats = None

    # ---------------------------------------------------------------------
    # Get the stream definition params

    # Limiting window of the stream. It would not return any records until
    # 'first_record' ID is read (or very first with the ID above that). The
    # stream will return EOS once it reads record with ID 'last_record' or
    # above (NOTE: the name 'lastRecord' is misleading because it is NOT
    #  inclusive).
    firstRecordIdx = sourceDict.get('first_record', None)
    self._sourceLastRecordIdx = sourceDict.get('last_record', None)

    # If a bookmark was given, then override first_record from the stream
    #  definition.
    if bookmark is not None:
      firstRecordIdx = None


    # Column names must be provided in the streamdef json
    # Special case is ['*'], meaning all available names from the record stream
    self._streamFieldNames = sourceDict.get('columns', None)
    wantsAllFields = (self._streamFieldNames is not None and
                      self._streamFieldNames[0] == '*')
    self._needFieldsFiltering = not wantsAllFields

    # Types must be specified in streamdef json, or in case of the
    #  file record stream types could be implicit from the file
    streamFieldTypes = sourceDict.get('types', None)
    self._logger.debug('Types from the def: %s', streamFieldTypes)
    # Validate that all types are valid
    if streamFieldTypes is not None:
      for dataType in streamFieldTypes:
        assert FieldMetaType.isValid(dataType)

    # Reset, sequence and time fields might be provided by streamdef json
    streamResetFieldName = streamDef.get('resetField', None)
    streamTimeFieldName = streamDef.get('timeField', None)
    streamSequenceFieldName = streamDef.get('sequenceIdField', None)
    self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                                                      streamTimeFieldName,
                                                      streamSequenceFieldName)


    # =======================================================================
    # Open up the underlying record store
    dataUrl = sourceDict.get('source', None)
    assert dataUrl is not None
    self._recordStore = self._openStream(dataUrl, isBlocking, maxTimeout,
                                         bookmark, firstRecordIdx)
    assert self._recordStore is not None


    # =======================================================================
    # Prepare the data structures we need for returning just the fields
    #  the caller wants from each record
    recordStoreFields = self._recordStore.getFields()
    self._recordStoreFieldNames = self._recordStore.getFieldNames()

    # ['*'] was requested: expose every field of the record store
    if not self._needFieldsFiltering:
      self._streamFieldNames = self._recordStoreFieldNames

    # Build up the field definitions for each field. This is a list of tuples
    #  of (name, type, special)
    self._streamFields = []
    for dstIdx, name in enumerate(self._streamFieldNames):
      if name not in self._recordStoreFieldNames:
        raise RuntimeError("The column '%s' from the stream definition "
          "is not present in the underlying stream which has the following "
          "columns: %s" % (name, self._recordStoreFieldNames))

      # Start from what the record store reports for this field
      fieldIdx = self._recordStoreFieldNames.index(name)
      fieldType = recordStoreFields[fieldIdx].type
      fieldSpecial = recordStoreFields[fieldIdx].special

      # If the types or specials were defined in the stream definition,
      #   then override what was found in the record store
      if streamFieldTypes is not None:
        fieldType = streamFieldTypes[dstIdx]

      # When one name is given several roles, the last matching check below
      #  wins (reset, then timestamp, then sequence)
      if streamResetFieldName is not None and streamResetFieldName == name:
        fieldSpecial = FieldMetaSpecial.reset
      if streamTimeFieldName is not None and streamTimeFieldName == name:
        fieldSpecial = FieldMetaSpecial.timestamp
      if (streamSequenceFieldName is not None and
          streamSequenceFieldName == name):
        fieldSpecial = FieldMetaSpecial.sequence

      self._streamFields.append(FieldMetaInfo(name, fieldType, fieldSpecial))


    # ========================================================================
    # Create the aggregator which will handle aggregation of records before
    #  returning them. The special field names were already read from the
    #  stream definition above, so reuse them here.
    aggregationInfo = streamDef.get('aggregation', None)
    self._aggregator = Aggregator(
            aggregationInfo=aggregationInfo,
            inputFields=recordStoreFields,
            timeFieldName=streamTimeFieldName,
            sequenceIdFieldName=streamSequenceFieldName,
            resetFieldName=streamResetFieldName)

    # We rely on the aggregator to tell us the bookmark of the last raw input
    #  that contributed to the aggregated record
    self._aggBookmark = None

    # Compute the aggregation period in terms of months and seconds
    if 'aggregation' in streamDef:
      self._aggMonthsAndSeconds = nupic.support.aggregationToMonthsSeconds(
                aggregationInfo)
    else:
      self._aggMonthsAndSeconds = None


    # ========================================================================
    # Are we saving the generated output to a csv?
    if saveOutput:
      tmpDir = tempfile.mkdtemp()
      outFilename = os.path.join(tmpDir, "generated_output.csv")
      # Pass the filename as a logging argument so the message is only
      #  formatted when the record is actually emitted
      self._logger.info("StreamReader: Saving generated records to: '%s'",
                        outFilename)
      self._writer = FileRecordStream(streamID=outFilename,
                                      write=True,
                                      fields=self._streamFields)
    else:
      self._writer = None
Example #5
0
def _runExperimentImpl(options, model=None):
    """Creates and runs the experiment

  Args:
    options: namedtuple ParseCommandLineOptionsResult
    model: For testing: may pass in an existing OPF Model instance
        to use instead of creating a new one.

  Returns: reference to OPFExperiment instance that was constructed (this
      is provided to aid with debugging) or None, if none was
      created.
  """
    jsonhelpers.validate(options.privateOptions,
                         schemaDict=g_parsedPrivateCommandLineOptionsSchema)

    # Load the experiment's description.py module
    experimentDir = options.experimentDir
    descriptionPyModule = opf_helpers.loadExperimentDescriptionScriptFromDir(
        experimentDir)
    expIface = opf_helpers.getExperimentDescriptionInterfaceFromModule(
        descriptionPyModule)

    # Handle "list checkpoints" request
    if options.privateOptions['listAvailableCheckpoints']:
        _printAvailableCheckpoints(experimentDir)
        return None

    # Load experiment tasks
    experimentTasks = expIface.getModelControl().get('tasks', [])

    # If the tasks list is empty, and this is a nupic environment description
    # file being run from the OPF, convert it to a simple OPF description file.
    if (len(experimentTasks) == 0 and expIface.getModelControl()['environment']
            == OpfEnvironment.Nupic):
        expIface.convertNupicEnvToOPF()
        experimentTasks = expIface.getModelControl().get('tasks', [])

    # Ensures all the source locations are either absolute paths or relative to
    # the nupic.datafiles package_data location.
    expIface.normalizeStreamSources()

    # Extract option
    newSerialization = options.privateOptions['newSerialization']

    # Handle listTasks
    if options.privateOptions['listTasks']:
        print "Available tasks:"

        for label in [t['taskLabel'] for t in experimentTasks]:
            print "\t", label

        return None

    # Construct the experiment instance
    if options.privateOptions['runCheckpointName']:

        assert model is None

        checkpointName = options.privateOptions['runCheckpointName']

        model = ModelFactory.loadFromCheckpoint(
            savedModelDir=_getModelCheckpointDir(experimentDir,
                                                 checkpointName),
            newSerialization=newSerialization)

    elif model is not None:
        print "Skipping creation of OPFExperiment instance: caller provided his own"
    else:
        modelDescription = expIface.getModelDescription()
        model = ModelFactory.create(modelDescription)

    # Handle "create model" request
    if options.privateOptions['createCheckpointName']:
        checkpointName = options.privateOptions['createCheckpointName']
        _saveModel(model=model,
                   experimentDir=experimentDir,
                   checkpointLabel=checkpointName,
                   newSerialization=newSerialization)

        return model

    # Build the task list

    # Default task execution index list is in the natural list order of the tasks
    taskIndexList = range(len(experimentTasks))

    customTaskExecutionLabelsList = options.privateOptions['taskLabels']
    if customTaskExecutionLabelsList:
        taskLabelsList = [t['taskLabel'] for t in experimentTasks]
        taskLabelsSet = set(taskLabelsList)

        customTaskExecutionLabelsSet = set(customTaskExecutionLabelsList)

        assert customTaskExecutionLabelsSet.issubset(taskLabelsSet), \
               ("Some custom-provided task execution labels don't correspond "
                "to actual task labels: mismatched labels: %r; actual task "
                "labels: %r.") % (customTaskExecutionLabelsSet - taskLabelsSet,
                                  customTaskExecutionLabelsList)

        taskIndexList = [
            taskLabelsList.index(label)
            for label in customTaskExecutionLabelsList
        ]

        print "#### Executing custom task list: %r" % [
            taskLabelsList[i] for i in taskIndexList
        ]

    # Run all experiment tasks
    for taskIndex in taskIndexList:

        task = experimentTasks[taskIndex]

        # Create a task runner and run it!
        taskRunner = _TaskRunner(model=model, task=task, cmdOptions=options)
        taskRunner.run()
        del taskRunner

        if options.privateOptions['checkpointModel']:
            _saveModel(model=model,
                       experimentDir=experimentDir,
                       checkpointLabel=task['taskLabel'],
                       newSerialization=newSerialization)

    return model
Example #6
0
  def run(self):
    """ Run this worker.

    Connects to the jobs database, optionally inserts a new job built from
    options.params, instantiates the swarm object from the job's params and
    then loops: pick (or insert) a model, evaluate it, repeat. The loop ends
    when the swarm's createModels() reports that it is done or, when
    options.modelID was given, after that single model has been run. The
    swarm's close() is always called on the way out.

    Parameters:
    ----------------------------------------------------------------------
    retval:     jobID of the job we ran. This is used by unit test code
                  when calling this working using the --params command
                  line option (which tells this worker to insert the job
                  itself).

    Raises:     RuntimeError if the job params' 'hsVersion' is not 'v2'.
    """
    # Easier access to options
    options = self._options

    # ---------------------------------------------------------------------
    # Connect to the jobs database
    self.logger.info("Connecting to the jobs database")
    cjDAO = ClientJobsDAO.get()

    # Get our worker ID
    self._workerID = cjDAO.getConnectionID()

    if options.clearModels:
      cjDAO.modelsClearAll()

    # -------------------------------------------------------------------------
    # if params were specified on the command line, insert a new job using
    #  them.
    if options.params is not None:
      options.jobID = cjDAO.jobInsert(client='hwTest', cmdLine="echo 'test mode'",
                  params=options.params, alreadyRunning=True,
                  minimumWorkers=1, maximumWorkers=1,
                  jobType = cjDAO.JOB_TYPE_HS)

    # The worker ID shown in the log prefix may be overridden from the options
    if options.workerID is not None:
      wID = options.workerID
    else:
      wID = self._workerID

    buildID = Configuration.get('nupic.software.buildNumber', 'N/A')
    logPrefix = '<BUILDID=%s, WORKER=HW, WRKID=%s, JOBID=%s> ' % \
                (buildID, wID, options.jobID)
    ExtendedLogger.setLogPrefix(logPrefix)

    # ---------------------------------------------------------------------
    # Get the search parameters
    # If asked to reset the job status, do that now
    if options.resetJobStatus:
      cjDAO.jobSetFields(options.jobID,
           fields={'workerCompletionReason': ClientJobsDAO.CMPL_REASON_SUCCESS,
                   'cancel': False,
                   #'engWorkerState': None
                   },
           useConnectionID=False,
           ignoreUnchanged=True)
    jobInfo = cjDAO.jobInfo(options.jobID)
    self.logger.info("Job info retrieved: %s", clippedObj(jobInfo))


    # ---------------------------------------------------------------------
    # Instantiate the swarm object, which will handle the logic of
    #  which models to create when we need more to evaluate.
    jobParams = json.loads(jobInfo.params)

    # Validate job params
    jsonSchemaPath = os.path.join(os.path.dirname(__file__),
                                  "jsonschema",
                                  "jobParamsSchema.json")
    jsonhelpers.validate(jobParams, schemaPath=jsonSchemaPath)


    hsVersion = jobParams.get('hsVersion', None)
    if hsVersion == 'v2':
      self._hs = SwarmV2(searchParams=jobParams, workerID=self._workerID,
              cjDAO=cjDAO, jobID=options.jobID, logLevel=options.logLevel)
    else:
      raise RuntimeError("Invalid swarm implementation (%s) specified" \
                          % (hsVersion))


    # =====================================================================
    # The main loop.
    try:
      # 'finished' is set by the swarm's createModels(), or after a single
      #  explicitly requested model has been run
      finished = False
      numModelsTotal = 0
      print >>sys.stderr, "reporter:status:Evaluating first model..."
      while not finished:

        # ------------------------------------------------------------------
        # Choose a model to evaluate
        batchSize = 10              # How many to try at a time.
        modelIDToRun = None
        while modelIDToRun is None:

          if options.modelID is None:
            # -----------------------------------------------------------------
            # Get the latest results on all running models and send them to
            #  the swarm implementation
            # This calls cjDAO.modelsGetUpdateCounters(), compares the
            # updateCounters with what we have cached, fetches the results for the
            # changed and new models, and sends those to the swarm
            # implementation's self._hs.recordModelProgress() method.
            self._processUpdatedModels(cjDAO)

            # --------------------------------------------------------------------
            # Create a new batch of models
            (finished, newModels) = self._hs.createModels(numModels = batchSize)
            if finished:
              break

            # No more models left to create, just loop. The _hs is waiting for
            #   all remaining running models to complete, and may pick up on an
            #  orphan if it detects one.
            if len(newModels) == 0:
              continue

            # Try and insert one that we will run
            for (modelParams, modelParamsHash, particleHash) in newModels:
              jsonModelParams = json.dumps(modelParams)
              (modelID, ours) = cjDAO.modelInsertAndStart(options.jobID,
                                  jsonModelParams, modelParamsHash, particleHash)

              # Some other worker is already running it, tell the swarm object
              #  so that it doesn't try and insert it again
              if not ours:
                mParamsAndHash = cjDAO.modelsGetParams([modelID])[0]
                mResult = cjDAO.modelsGetResultAndStatus([modelID])[0]
                results = mResult.results
                if results is not None:
                  results = json.loads(results)

                modelParams = json.loads(mParamsAndHash.params)
                particleHash = cjDAO.modelsGetFields(modelID,
                                  ['engParticleHash'])[0]
                particleInst = "%s.%s" % (
                          modelParams['particleState']['id'],
                          modelParams['particleState']['genIdx'])
                self.logger.info("Adding model %d to our internal DB " \
                      "because modelInsertAndStart() failed to insert it: " \
                      "paramsHash=%s, particleHash=%s, particleId='%s'", modelID,
                      mParamsAndHash.engParamsHash.encode('hex'),
                      particleHash.encode('hex'), particleInst)
                self._hs.recordModelProgress(modelID = modelID,
                      modelParams = modelParams,
                      modelParamsHash = mParamsAndHash.engParamsHash,
                      results = results,
                      completed = (mResult.status == cjDAO.STATUS_COMPLETED),
                      completionReason = mResult.completionReason,
                      matured = mResult.engMatured,
                      numRecords = mResult.numRecords)
              else:
                # We own this one: stop looking and go evaluate it
                modelIDToRun = modelID
                break

          else:
            # A specific modelID was passed on the command line
            modelIDToRun = int(options.modelID)
            mParamsAndHash = cjDAO.modelsGetParams([modelIDToRun])[0]
            modelParams = json.loads(mParamsAndHash.params)
            modelParamsHash = mParamsAndHash.engParamsHash

            # Make us the worker
            cjDAO.modelSetFields(modelIDToRun,
                                     dict(engWorkerConnId=self._workerID))
            # NOTE: a permanently disabled ("if False:") branch used to follow
            #  here; it re-hashed the existing entry and re-inserted the model.
            #  It could never execute, so it has been removed.

            # ^^^ end while modelIDToRun ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

        # ---------------------------------------------------------------
        # We have a model, evaluate it now
        # All done?
        if finished:
          break

        # Run the model now
        self.logger.info("RUNNING MODEL GID=%d, paramsHash=%s, params=%s",
              modelIDToRun, modelParamsHash.encode('hex'), modelParams)

        # ---------------------------------------------------------------------
        # Construct model checkpoint GUID for this model:
        # jobParams['persistentJobGUID'] contains the client's (e.g., API Server)
        # persistent, globally-unique model identifier, which is what we need;
        persistentJobGUID = jobParams['persistentJobGUID']
        assert persistentJobGUID, "persistentJobGUID: %r" % (persistentJobGUID,)

        modelCheckpointGUID = jobInfo.client + "_" + persistentJobGUID + (
          '_' + str(modelIDToRun))


        self._hs.runModel(modelID=modelIDToRun, jobID = options.jobID,
                          modelParams=modelParams, modelParamsHash=modelParamsHash,
                          jobsDAO=cjDAO, modelCheckpointGUID=modelCheckpointGUID)

        # TODO: don't increment for orphaned models
        numModelsTotal += 1

        self.logger.info("COMPLETED MODEL GID=%d; EVALUATED %d MODELs",
          modelIDToRun, numModelsTotal)
        print >>sys.stderr, "reporter:status:Evaluated %d models..." % \
                                    (numModelsTotal)
        print >>sys.stderr, "reporter:counter:swarmWorker,numModels,1"

        # An explicitly requested model is run exactly once
        if options.modelID is not None:
          finished = True
        # ^^^ end while not finished

    finally:
      # Provide swarm instance an opportunity to clean up temporary files
      self._hs.close()

    self.logger.info("FINISHED. Evaluated %d models.", numModelsTotal)
    print >>sys.stderr, "reporter:status:Finished, evaluated %d models" % (numModelsTotal)
    return options.jobID
Example #7
0
    def __init__(self,
                 streamDef,
                 bookmark=None,
                 saveOutput=False,
                 isBlocking=True,
                 maxTimeout=0,
                 eofOnTimeout=False):
        """ Base class constructor, performs common initialization

    Parameters:
    ----------------------------------------------------------------
    streamDef:  The stream definition, potentially containing multiple sources
                (not supported yet). See
                /nupic/frameworks/opf/jsonschema/stream_def.json for the format
                of this dict

    bookmark: Bookmark to start reading from. This overrides the first_record
                field of the streamDef if provided.

    saveOutput: If true, save the output to a csv file in a temp directory.
                The path to the generated file can be found in the log
                output.

    isBlocking: should read operation block *forever* if the next row of data
                is not available, but the stream is not marked as 'completed'
                yet?

    maxTimeout: if isBlocking is False, max seconds to wait for more data before
                timing out; ignored when isBlocking is True.

    eofOnTimeout: If True and we get a read timeout (isBlocking must be False
                to get read timeouts), assume we've reached the end of the
                input and produce the last aggregated record, if one can be
                completed.

    """

        # Call superclass constructor
        super(StreamReader, self).__init__()

        loggerPrefix = 'com.numenta.nupic.data.StreamReader'
        self._logger = logging.getLogger(loggerPrefix)
        jsonhelpers.validate(streamDef,
                             schemaPath=pkg_resources.resource_filename(
                                 jsonschema.__name__, "stream_def.json"))
        assert len(
            streamDef['streams']) == 1, "Only 1 source stream is supported"

        # Save constructor args
        sourceDict = streamDef['streams'][0]
        self._recordCount = 0
        self._eofOnTimeout = eofOnTimeout
        self._logger.debug('Reading stream with the def: %s', sourceDict)

        # Dictionary to store record statistics (min and max of scalars for now)
        self._stats = None

        # ---------------------------------------------------------------------
        # Get the stream definition params

        # Limiting window of the stream. It would not return any records until
        # 'first_record' ID is read (or very first with the ID above that). The
        # stream will return EOS once it reads record with ID 'last_record' or
        # above (NOTE: the name 'lastRecord' is misleading because it is NOT
        #  inclusive).
        firstRecordIdx = sourceDict.get('first_record', None)
        self._sourceLastRecordIdx = sourceDict.get('last_record', None)

        # If a bookmark was given, then override first_record from the stream
        #  definition.
        if bookmark is not None:
            firstRecordIdx = None

        # Column names must be provided in the streamdef json
        # Special case is ['*'], meaning all available names from the record stream
        self._streamFieldNames = sourceDict.get('columns', None)
        if self._streamFieldNames != None and self._streamFieldNames[0] == '*':
            self._needFieldsFiltering = False
        else:
            self._needFieldsFiltering = True

        # Types must be specified in streamdef json, or in case of the
        #  file_recod_stream types could be implicit from the file
        streamFieldTypes = sourceDict.get('types', None)
        self._logger.debug('Types from the def: %s', streamFieldTypes)
        # Validate that all types are valid
        if streamFieldTypes is not None:
            for dataType in streamFieldTypes:
                assert FieldMetaType.isValid(dataType)

        # Reset, sequence and time fields might be provided by streamdef json
        streamResetFieldName = streamDef.get('resetField', None)
        streamTimeFieldName = streamDef.get('timeField', None)
        streamSequenceFieldName = streamDef.get('sequenceIdField', None)
        self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                           streamTimeFieldName, streamSequenceFieldName)

        # =======================================================================
        # Open up the underlying record store
        dataUrl = sourceDict.get('source', None)
        assert dataUrl is not None
        self._recordStore = self._openStream(dataUrl, isBlocking, maxTimeout,
                                             bookmark, firstRecordIdx)
        assert self._recordStore is not None

        # =======================================================================
        # Prepare the data structures we need for returning just the fields
        #  the caller wants from each record
        recordStoreFields = self._recordStore.getFields()
        self._recordStoreFieldNames = self._recordStore.getFieldNames()

        if not self._needFieldsFiltering:
            self._streamFieldNames = self._recordStoreFieldNames

        # Build up the field definitions for each field. This is a list of tuples
        #  of (name, type, special)
        self._streamFields = []
        for dstIdx, name in enumerate(self._streamFieldNames):
            if name not in self._recordStoreFieldNames:
                raise RuntimeError(
                    "The column '%s' from the stream definition "
                    "is not present in the underlying stream which has the following "
                    "columns: %s" % (name, self._recordStoreFieldNames))

            fieldIdx = self._recordStoreFieldNames.index(name)
            fieldType = recordStoreFields[fieldIdx].type
            fieldSpecial = recordStoreFields[fieldIdx].special

            # If the types or specials were defined in the stream definition,
            #   then override what was found in the record store
            if streamFieldTypes is not None:
                fieldType = streamFieldTypes[dstIdx]

            if streamResetFieldName is not None and streamResetFieldName == name:
                fieldSpecial = FieldMetaSpecial.reset
            if streamTimeFieldName is not None and streamTimeFieldName == name:
                fieldSpecial = FieldMetaSpecial.timestamp
            if (streamSequenceFieldName is not None
                    and streamSequenceFieldName == name):
                fieldSpecial = FieldMetaSpecial.sequence

            self._streamFields.append(
                FieldMetaInfo(name, fieldType, fieldSpecial))

        # ========================================================================
        # Create the aggregator which will handle aggregation of records before
        #  returning them.
        self._aggregator = Aggregator(
            aggregationInfo=streamDef.get('aggregation', None),
            inputFields=recordStoreFields,
            timeFieldName=streamDef.get('timeField', None),
            sequenceIdFieldName=streamDef.get('sequenceIdField', None),
            resetFieldName=streamDef.get('resetField', None))

        # We rely on the aggregator to tell us the bookmark of the last raw input
        #  that contributed to the aggregated record
        self._aggBookmark = None

        # Compute the aggregation period in terms of months and seconds
        if 'aggregation' in streamDef:
            self._aggMonthsAndSeconds = nupic.support.aggregationToMonthsSeconds(
                streamDef.get('aggregation'))
        else:
            self._aggMonthsAndSeconds = None

        # ========================================================================
        # Are we saving the generated output to a csv?
        if saveOutput:
            tmpDir = tempfile.mkdtemp()
            outFilename = os.path.join(tmpDir, "generated_output.csv")
            self._logger.info(
                "StreamReader: Saving generated records to: '%s'" %
                outFilename)
            self._writer = FileRecordStream(streamID=outFilename,
                                            write=True,
                                            fields=self._streamFields)
        else:
            self._writer = None
Example #8
0
    def run(self):
        """ Run this worker.

        Connects to the jobs database, optionally inserts a new job built
        from options.params, validates the job's params against
        jobParamsSchema.json, instantiates the swarm object and then
        repeatedly selects (or creates) a model and evaluates it. The loop
        ends when the swarm's createModels() reports that it is done, or
        after a single model when options.modelID was given.

        Parameters:
        ----------------------------------------------------------------------
        retval:     jobID of the job we ran. This is used by unit test code
                      when calling this working using the --params command
                      line option (which tells this worker to insert the job
                      itself).

        Raises: RuntimeError when the job params request a swarm
                implementation ('hsVersion') other than 'v2'.
        """
        # Easier access to options
        options = self._options

        # ---------------------------------------------------------------------
        # Connect to the jobs database
        self.logger.info("Connecting to the jobs database")
        cjDAO = ClientJobsDAO.get()

        # Get our worker ID
        self._workerID = cjDAO.getConnectionID()

        if options.clearModels:
            cjDAO.modelsClearAll()

        # ---------------------------------------------------------------------
        # if params were specified on the command line, insert a new job using
        #  them.
        if options.params is not None:
            options.jobID = cjDAO.jobInsert(client='hwTest',
                                            cmdLine="echo 'test mode'",
                                            params=options.params,
                                            alreadyRunning=True,
                                            minimumWorkers=1,
                                            maximumWorkers=1,
                                            jobType=cjDAO.JOB_TYPE_HS)

        # The worker ID used in the log prefix may be overridden from the
        #  command line; self._workerID itself always stays the connection ID.
        if options.workerID is not None:
            wID = options.workerID
        else:
            wID = self._workerID

        buildID = Configuration.get('nupic.software.buildNumber', 'N/A')
        logPrefix = '<BUILDID=%s, WORKER=HW, WRKID=%s, JOBID=%s> ' % \
                    (buildID, wID, options.jobID)
        ExtendedLogger.setLogPrefix(logPrefix)

        # ---------------------------------------------------------------------
        # Get the search parameters
        # If asked to reset the job status, do that now
        if options.resetJobStatus:
            cjDAO.jobSetFields(
                options.jobID,
                fields={
                    'workerCompletionReason':
                    ClientJobsDAO.CMPL_REASON_SUCCESS,
                    'cancel': False,
                },
                useConnectionID=False,
                ignoreUnchanged=True)
        jobInfo = cjDAO.jobInfo(options.jobID)
        self.logger.info("Job info retrieved: %s", str(clippedObj(jobInfo)))

        # ---------------------------------------------------------------------
        # Instantiate the swarm object, which will handle the logic of
        #  which models to create when we need more to evaluate.
        jobParams = json.loads(jobInfo.params)

        # Validate job params
        jsonSchemaPath = os.path.join(os.path.dirname(__file__), "jsonschema",
                                      "jobParamsSchema.json")
        jsonhelpers.validate(jobParams, schemaPath=jsonSchemaPath)

        hsVersion = jobParams.get('hsVersion', None)
        if hsVersion == 'v2':
            self._hs = SwarmV2(searchParams=jobParams,
                               workerID=self._workerID,
                               cjDAO=cjDAO,
                               jobID=options.jobID,
                               logLevel=options.logLevel)
        else:
            raise RuntimeError("Invalid swarm implementation (%s) specified" \
                                % (hsVersion))

        # =====================================================================
        # The main loop.
        try:
            # Named 'done' rather than 'exit' so the builtin is not shadowed
            done = False
            numModelsTotal = 0
            batchSize = 10  # How many models to try creating at a time.
            print >> sys.stderr, "reporter:status:Evaluating first model..."
            while not done:

                # -------------------------------------------------------------
                # Choose a model to evaluate
                modelIDToRun = None
                while modelIDToRun is None:

                    if options.modelID is None:
                        # -----------------------------------------------------
                        # Get the latest results on all running models and
                        #  send them to the swarm implementation
                        # This calls cjDAO.modelsGetUpdateCounters(), compares
                        # the updateCounters with what we have cached, fetches
                        # the results for the changed and new models, and sends
                        # those to the swarm implementation's
                        # self._hs.recordModelProgress() method.
                        self._processUpdatedModels(cjDAO)

                        # -----------------------------------------------------
                        # Create a new batch of models
                        (done, newModels) = self._hs.createModels(
                            numModels=batchSize)
                        if done:
                            break

                        # No more models left to create, just loop. The _hs is
                        #  waiting for all remaining running models to
                        #  complete, and may pick up on an orphan if it detects
                        #  one.
                        if len(newModels) == 0:
                            continue

                        # Try and insert one that we will run
                        for (modelParams, modelParamsHash,
                             particleHash) in newModels:
                            jsonModelParams = json.dumps(modelParams)
                            (modelID, ours) = cjDAO.modelInsertAndStart(
                                options.jobID, jsonModelParams,
                                modelParamsHash, particleHash)

                            if ours:
                                # We own this model: run it with the params
                                #  and hash bound by this loop iteration.
                                modelIDToRun = modelID
                                break

                            # Some other worker is already running it, tell
                            #  the swarm object so that it doesn't try and
                            #  insert it again
                            mParamsAndHash = cjDAO.modelsGetParams(
                                [modelID])[0]
                            mResult = cjDAO.modelsGetResultAndStatus(
                                [modelID])[0]
                            results = mResult.results
                            if results is not None:
                                results = json.loads(results)

                            modelParams = json.loads(mParamsAndHash.params)
                            particleHash = cjDAO.modelsGetFields(
                                modelID, ['engParticleHash'])[0]
                            particleInst = "%s.%s" % (
                                modelParams['particleState']['id'],
                                modelParams['particleState']['genIdx'])
                            self.logger.info("Adding model %d to our internal DB " \
                                  "because modelInsertAndStart() failed to insert it: " \
                                  "paramsHash=%s, particleHash=%s, particleId='%s'", modelID,
                                  mParamsAndHash.engParamsHash.encode('hex'),
                                  particleHash.encode('hex'), particleInst)
                            self._hs.recordModelProgress(
                                modelID=modelID,
                                modelParams=modelParams,
                                modelParamsHash=mParamsAndHash.engParamsHash,
                                results=results,
                                completed=(mResult.status ==
                                           cjDAO.STATUS_COMPLETED),
                                completionReason=mResult.completionReason,
                                matured=mResult.engMatured,
                                numRecords=mResult.numRecords)

                    else:
                        # A specific modelID was passed on the command line
                        modelIDToRun = int(options.modelID)
                        mParamsAndHash = cjDAO.modelsGetParams(
                            [modelIDToRun])[0]
                        modelParams = json.loads(mParamsAndHash.params)
                        modelParamsHash = mParamsAndHash.engParamsHash

                        # Make us the worker
                        cjDAO.modelSetFields(
                            modelIDToRun, dict(engWorkerConnId=self._workerID))

                        # ^^^ end while modelIDToRun ^^^^^^^^^^^^^^^^^^^^^^^^^^

                # -------------------------------------------------------------
                # We have a model, evaluate it now
                # All done?
                if done:
                    break

                # Run the model now
                self.logger.info(
                    "RUNNING MODEL GID=%d, paramsHash=%s, params=%s",
                    modelIDToRun, modelParamsHash.encode('hex'), modelParams)

                # -------------------------------------------------------------
                # Construct model checkpoint GUID for this model:
                # jobParams['persistentJobGUID'] contains the client's (e.g.,
                # API Server) persistent, globally-unique model identifier,
                # which is what we need;
                persistentJobGUID = jobParams['persistentJobGUID']
                assert persistentJobGUID, "persistentJobGUID: %r" % (
                    persistentJobGUID, )

                modelCheckpointGUID = jobInfo.client + "_" + persistentJobGUID + (
                    '_' + str(modelIDToRun))

                self._hs.runModel(modelID=modelIDToRun,
                                  jobID=options.jobID,
                                  modelParams=modelParams,
                                  modelParamsHash=modelParamsHash,
                                  jobsDAO=cjDAO,
                                  modelCheckpointGUID=modelCheckpointGUID)

                # TODO: don't increment for orphaned models
                numModelsTotal += 1

                self.logger.info("COMPLETED MODEL GID=%d; EVALUATED %d MODELs",
                                 modelIDToRun, numModelsTotal)
                print >>sys.stderr, "reporter:status:Evaluated %d models..." % \
                                            (numModelsTotal)
                print >> sys.stderr, "reporter:counter:swarmWorker,numModels,1"

                # When a single model was requested explicitly, stop after
                #  evaluating it.
                if options.modelID is not None:
                    done = True
                # ^^^ end while not done

        finally:
            # Provide swarm instance an opportunity to clean up temporary files
            self._hs.close()

        self.logger.info("FINISHED. Evaluated %d models.", numModelsTotal)
        print >> sys.stderr, "reporter:status:Finished, evaluated %d models" % (
            numModelsTotal)
        return options.jobID
Example #9
0
  def __init__(self, streamDef, bookmark=None, saveOutput=False,
               isBlocking=True, maxTimeout=0, eofOnTimeout=False):
    """ Base class constructor, performs common initialization

    Parameters:
    ----------------------------------------------------------------
    streamDef:  The stream definition, potentially containing multiple sources
                (not supported yet). See
                /nupic/frameworks/opf/jsonschema/stream_def.json for the format
                of this dict

    bookmark: Bookmark to start reading from. This overrides the first_record
                field of the streamDef if provided.

    saveOutput: If true, save the output to a csv file in a temp directory.
                The path to the generated file can be found in the log
                output.

    isBlocking: should read operation block *forever* if the next row of data
                is not available, but the stream is not marked as 'completed'
                yet?

    maxTimeout: if isBlocking is False, max seconds to wait for more data before
                timing out; ignored when isBlocking is True.

    eofOnTimeout: If True and we get a read timeout (isBlocking must be False
                to get read timeouts), assume we've reached the end of the
                input and produce the last aggregated record, if one can be
                completed.

    """

    # Call superclass constructor
    super(StreamReader, self).__init__()

    loggerPrefix = 'com.numenta.nupic.data.StreamReader'
    self._logger = logging.getLogger(loggerPrefix)
    jsonhelpers.validate(streamDef,
                         schemaPath=resource_filename(
                             jsonschema.__name__, "stream_def.json"))
    assert len(streamDef['streams']) == 1, "Only 1 source stream is supported"

    # Save constructor args
    sourceDict = streamDef['streams'][0]
    self._recordCount = 0
    self._eofOnTimeout = eofOnTimeout
    self._logger.debug('Reading stream with the def: %s', sourceDict)

    # Dictionary to store record statistics (min and max of scalars for now)
    self._stats = None

    # ---------------------------------------------------------------------
    # Get the stream definition params

    # Limiting window of the stream. It would not return any records until
    # 'first_record' ID is read (or very first with the ID above that). The
    # stream will return EOS once it reads record with ID 'last_record' or
    # above (NOTE: the name 'lastRecord' is misleading because it is NOT
    #  inclusive).
    firstRecordIdx = sourceDict.get('first_record', None)
    self._sourceLastRecordIdx = sourceDict.get('last_record', None)

    # If a bookmark was given, then override first_record from the stream
    #  definition.
    if bookmark is not None:
      firstRecordIdx = None


    # Column names must be provided in the streamdef json
    # Special case is ['*'], meaning all available names from the record stream
    self._streamFieldNames = sourceDict.get('columns', None)
    if self._streamFieldNames != None and self._streamFieldNames[0] == '*':
      self._needFieldsFiltering = False
    else:
      self._needFieldsFiltering = True

    # Types must be specified in streamdef json, or in case of the
    #  file_recod_stream types could be implicit from the file
    streamFieldTypes = sourceDict.get('types', None)
    self._logger.debug('Types from the def: %s', streamFieldTypes)
    # Validate that all types are valid
    if streamFieldTypes != None:
      for dataType in streamFieldTypes:
        assert(dataType in TYPES)

    # Reset, sequence and time fields might be provided by streamdef json
    streamResetFieldName = streamDef.get('resetField', None)
    streamTimeFieldName = streamDef.get('timeField', None)
    streamSequenceFieldName = streamDef.get('sequenceIdField', None)
    self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                                                      streamTimeFieldName,
                                                      streamSequenceFieldName)


    # ================================