Python validate Examples

Programming Language: Python

Namespace/Package Name: nupic.data.jsonhelpers

Method/Function: validate

Examples at hotexamples.com: 9

Python validate - 9 examples found. These are the top-rated real-world Python examples of nupic.data.jsonhelpers.validate extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

def validateOpfJsonValue(value, opfJsonSchemaFilename):
    """ Check a python object against one of the local OPF json schema files

    value:    python object to validate (typically a dictionary)

    opfJsonSchemaFilename: name of the OPF json schema file, looked up in the
                    "jsonschema" directory located next to this module
                    (e.g., opfTaskControlSchema.json)

    Returns: nothing

    Raises: jsonhelpers.ValidationError when value fails json validation
    """
    # The schema files live in a "jsonschema" directory beside this module
    schemaRoot = os.path.join(os.path.dirname(__file__), "jsonschema")

    # Delegate the actual validation to jsonhelpers
    jsonhelpers.validate(
        value, schemaPath=os.path.join(schemaRoot, opfJsonSchemaFilename))

Example #2

Show file

File: opfutils.py Project: AI-Cdrone/nupic

def validateOpfJsonValue(value, opfJsonSchemaFilename):
  """ Validate `value` against the named OPF json schema file

  value:    the python object to validate (typically a dictionary)

  opfJsonSchemaFilename: filename of the OPF json schema to validate against;
                  it is resolved relative to the "jsonschema" directory that
                  sits beside this module (e.g., opfTaskControlSchema.json)

  Returns: nothing

  Raises: jsonhelpers.ValidationError when value fails json validation
  """
  # Resolve the schema file relative to this module's own directory
  moduleDir = os.path.dirname(__file__)
  schemaLocation = os.path.join(moduleDir, "jsonschema", opfJsonSchemaFilename)

  # Hand the object and the schema location over to the json helper
  jsonhelpers.validate(value, schemaPath=schemaLocation)

Example #3

Show file

File: experiment_runner.py Project: Gnomonol/nupic

def _runExperimentImpl(options, model=None):
    """Creates and runs the experiment

  Args:
    options: namedtuple ParseCommandLineOptionsResult
    model: For testing: may pass in an existing OPF Model instance
        to use instead of creating a new one.

  Returns: referece to OPFExperiment instance that was constructed (this
      is provided to aid with debugging) or None, if none was
      created.
  """
    jsonhelpers.validate(options.privateOptions, schemaDict=g_parsedPrivateCommandLineOptionsSchema)

    # Load the experiment's description.py module
    experimentDir = options.experimentDir
    descriptionPyModule = opfhelpers.loadExperimentDescriptionScriptFromDir(experimentDir)
    expIface = opfhelpers.getExperimentDescriptionInterfaceFromModule(descriptionPyModule)

    # Handle "list checkpoints" request
    if options.privateOptions["listAvailableCheckpoints"]:
        _printAvailableCheckpoints(experimentDir)
        return None

    # Load experiment tasks
    experimentTasks = expIface.getModelControl().get("tasks", [])

    # If the tasks list is empty, and this is a nupic environment description
    # file being run from the OPF, convert it to a simple OPF description file.
    if len(experimentTasks) == 0 and expIface.getModelControl()["environment"] == OpfEnvironment.Nupic:
        expIface.convertNupicEnvToOPF()
        experimentTasks = expIface.getModelControl().get("tasks", [])

    # Handle listTasks
    if options.privateOptions["listTasks"]:
        print "Available tasks:"

        for label in [t["taskLabel"] for t in experimentTasks]:
            print "\t", label

        return None

    # Construct the experiment instance
    if options.privateOptions["runCheckpointName"]:

        assert model is None

        checkpointName = options.privateOptions["runCheckpointName"]

        model = ModelFactory.loadFromCheckpoint(savedModelDir=_getModelCheckpointDir(experimentDir, checkpointName))

    elif model is not None:
        print "Skipping creation of OPFExperiment instance: caller provided his own"
    else:
        modelDescription = expIface.getModelDescription()
        model = ModelFactory.create(modelDescription)

    # Handle "create model" request
    if options.privateOptions["createCheckpointName"]:
        checkpointName = options.privateOptions["createCheckpointName"]
        _saveModel(model=model, experimentDir=experimentDir, checkpointLabel=checkpointName)

        return model

    # Build the task list

    # Default task execution index list is in the natural list order of the tasks
    taskIndexList = range(len(experimentTasks))

    customTaskExecutionLabelsList = options.privateOptions["taskLabels"]
    if customTaskExecutionLabelsList:
        taskLabelsList = [t["taskLabel"] for t in experimentTasks]
        taskLabelsSet = set(taskLabelsList)

        customTaskExecutionLabelsSet = set(customTaskExecutionLabelsList)

        assert customTaskExecutionLabelsSet.issubset(taskLabelsSet), (
            "Some custom-provided task execution labels don't correspond "
            "to actual task labels: mismatched labels: %r; actual task "
            "labels: %r."
        ) % (customTaskExecutionLabelsSet - taskLabelsSet, customTaskExecutionLabelsList)

        taskIndexList = [taskLabelsList.index(label) for label in customTaskExecutionLabelsList]

        print "#### Executing custom task list: %r" % [taskLabelsList[i] for i in taskIndexList]

    # Run all experiment tasks
    for taskIndex in taskIndexList:

        task = experimentTasks[taskIndex]

        # Create a task runner and run it!
        taskRunner = _TaskRunner(model=model, task=task, cmdOptions=options)
        taskRunner.run()
        del taskRunner

        if options.privateOptions["checkpointModel"]:
            _saveModel(model=model, experimentDir=experimentDir, checkpointLabel=task["taskLabel"])

    return model

Example #4

Show file

File: stream_reader.py Project: surajsangavkar/nupic

  def __init__(self, streamDef, bookmark=None, saveOutput=False,
               isBlocking=True, maxTimeout=0, eofOnTimeout=False):
    """ Base class constructor, performs common initialization

    Parameters:
    ----------------------------------------------------------------
    streamDef:  The stream definition, potentially containing multiple sources
                (not supported yet). See
                /nupic/frameworks/opf/jsonschema/stream_def.json for the format
                of this dict

    bookmark: Bookmark to start reading from. This overrides the first_record
                field of the streamDef if provided.

    saveOutput: If true, save the output to a csv file in a temp directory.
                The path to the generated file can be found in the log
                output.

    isBlocking: should read operation block *forever* if the next row of data
                is not available, but the stream is not marked as 'completed'
                yet?

    maxTimeout: if isBlocking is False, max seconds to wait for more data before
                timing out; ignored when isBlocking is True.

    eofOnTimeout: If True and we get a read timeout (isBlocking must be False
                to get read timeouts), assume we've reached the end of the
                input and produce the last aggregated record, if one can be
                completed.

    Raises:
    ----------------------------------------------------------------
    AssertionError: if streamDef does not contain exactly one stream, if a
                declared type is not a valid FieldMetaType, if the stream has
                no 'source', or if the record store could not be opened.
    RuntimeError: if a column named in the stream definition is not present
                in the underlying record store.

    """

    # Call superclass constructor
    super(StreamReader, self).__init__()

    loggerPrefix = 'com.numenta.nupic.data.StreamReader'
    self._logger = logging.getLogger(loggerPrefix)
    jsonhelpers.validate(streamDef,
                         schemaPath=pkg_resources.resource_filename(
                             jsonschema.__name__, "stream_def.json"))
    assert len(streamDef['streams']) == 1, "Only 1 source stream is supported"

    # Save constructor args
    sourceDict = streamDef['streams'][0]
    self._recordCount = 0
    self._eofOnTimeout = eofOnTimeout
    self._logger.debug('Reading stream with the def: %s', sourceDict)

    # Dictionary to store record statistics (min and max of scalars for now)
    self._stats = None

    # ---------------------------------------------------------------------
    # Get the stream definition params

    # Limiting window of the stream. It would not return any records until
    # 'first_record' ID is read (or very first with the ID above that). The
    # stream will return EOS once it reads record with ID 'last_record' or
    # above (NOTE: the name 'lastRecord' is misleading because it is NOT
    #  inclusive).
    firstRecordIdx = sourceDict.get('first_record', None)
    self._sourceLastRecordIdx = sourceDict.get('last_record', None)

    # If a bookmark was given, then override first_record from the stream
    #  definition.
    if bookmark is not None:
      firstRecordIdx = None


    # Column names must be provided in the streamdef json
    # Special case is ['*'], meaning all available names from the record stream
    self._streamFieldNames = sourceDict.get('columns', None)
    wantsAllColumns = (self._streamFieldNames is not None and
                       self._streamFieldNames[0] == '*')
    self._needFieldsFiltering = not wantsAllColumns

    # Types must be specified in streamdef json, or in case of the
    #  file_recod_stream types could be implicit from the file
    streamFieldTypes = sourceDict.get('types', None)
    self._logger.debug('Types from the def: %s', streamFieldTypes)
    # Validate that all types are valid
    if streamFieldTypes is not None:
      for dataType in streamFieldTypes:
        assert FieldMetaType.isValid(dataType)

    # Reset, sequence and time fields might be provided by streamdef json
    streamResetFieldName = streamDef.get('resetField', None)
    streamTimeFieldName = streamDef.get('timeField', None)
    streamSequenceFieldName = streamDef.get('sequenceIdField', None)
    self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                                                      streamTimeFieldName,
                                                      streamSequenceFieldName)


    # =======================================================================
    # Open up the underlying record store
    dataUrl = sourceDict.get('source', None)
    assert dataUrl is not None
    self._recordStore = self._openStream(dataUrl, isBlocking, maxTimeout,
                                         bookmark, firstRecordIdx)
    assert self._recordStore is not None


    # =======================================================================
    # Prepare the data structures we need for returning just the fields
    #  the caller wants from each record
    recordStoreFields = self._recordStore.getFields()
    self._recordStoreFieldNames = self._recordStore.getFieldNames()

    # With the '*' wildcard every field of the record store is returned
    if not self._needFieldsFiltering:
      self._streamFieldNames = self._recordStoreFieldNames

    # Build up the field definitions for each field. This is a list of tuples
    #  of (name, type, special)
    self._streamFields = []
    for dstIdx, name in enumerate(self._streamFieldNames):
      if name not in self._recordStoreFieldNames:
        raise RuntimeError("The column '%s' from the stream definition "
          "is not present in the underlying stream which has the following "
          "columns: %s" % (name, self._recordStoreFieldNames))

      fieldIdx = self._recordStoreFieldNames.index(name)
      fieldType = recordStoreFields[fieldIdx].type
      fieldSpecial = recordStoreFields[fieldIdx].special

      # If the types or specials were defined in the stream definition,
      #   then override what was found in the record store
      if streamFieldTypes is not None:
        fieldType = streamFieldTypes[dstIdx]

      # These checks are deliberately independent and ordered: if one name is
      #  given for several roles, the last matching role wins.
      if streamResetFieldName is not None and streamResetFieldName == name:
        fieldSpecial = FieldMetaSpecial.reset
      if streamTimeFieldName is not None and streamTimeFieldName == name:
        fieldSpecial = FieldMetaSpecial.timestamp
      if (streamSequenceFieldName is not None and
          streamSequenceFieldName == name):
        fieldSpecial = FieldMetaSpecial.sequence

      self._streamFields.append(FieldMetaInfo(name, fieldType, fieldSpecial))


    # ========================================================================
    # Create the aggregator which will handle aggregation of records before
    #  returning them. The special field names were already read from
    #  streamDef above, so they are reused here.
    self._aggregator = Aggregator(
            aggregationInfo=streamDef.get('aggregation', None),
            inputFields=recordStoreFields,
            timeFieldName=streamTimeFieldName,
            sequenceIdFieldName=streamSequenceFieldName,
            resetFieldName=streamResetFieldName)

    # We rely on the aggregator to tell us the bookmark of the last raw input
    #  that contributed to the aggregated record
    self._aggBookmark = None

    # Compute the aggregation period in terms of months and seconds
    if 'aggregation' in streamDef:
      self._aggMonthsAndSeconds = nupic.support.aggregationToMonthsSeconds(
                streamDef.get('aggregation'))
    else:
      self._aggMonthsAndSeconds = None


    # ========================================================================
    # Are we saving the generated output to a csv?
    if saveOutput:
      tmpDir = tempfile.mkdtemp()
      outFilename = os.path.join(tmpDir, "generated_output.csv")
      # Pass the filename as a logging argument so the message is only
      #  formatted when the record is actually emitted
      self._logger.info("StreamReader: Saving generated records to: '%s'",
                        outFilename)
      self._writer = FileRecordStream(streamID=outFilename,
                                      write=True,
                                      fields=self._streamFields)
    else:
      self._writer = None

Example #5

Show file

def _runExperimentImpl(options, model=None):
    """Creates and runs the experiment

  Args:
    options: namedtuple ParseCommandLineOptionsResult
    model: For testing: may pass in an existing OPF Model instance
        to use instead of creating a new one.

  Returns: reference to OPFExperiment instance that was constructed (this
      is provided to aid with debugging) or None, if none was
      created.
  """
    jsonhelpers.validate(options.privateOptions,
                         schemaDict=g_parsedPrivateCommandLineOptionsSchema)

    # Load the experiment's description.py module
    experimentDir = options.experimentDir
    descriptionPyModule = opf_helpers.loadExperimentDescriptionScriptFromDir(
        experimentDir)
    expIface = opf_helpers.getExperimentDescriptionInterfaceFromModule(
        descriptionPyModule)

    # Handle "list checkpoints" request
    if options.privateOptions['listAvailableCheckpoints']:
        _printAvailableCheckpoints(experimentDir)
        return None

    # Load experiment tasks
    experimentTasks = expIface.getModelControl().get('tasks', [])

    # If the tasks list is empty, and this is a nupic environment description
    # file being run from the OPF, convert it to a simple OPF description file.
    if (len(experimentTasks) == 0 and expIface.getModelControl()['environment']
            == OpfEnvironment.Nupic):
        expIface.convertNupicEnvToOPF()
        experimentTasks = expIface.getModelControl().get('tasks', [])

    # Ensures all the source locations are either absolute paths or relative to
    # the nupic.datafiles package_data location.
    expIface.normalizeStreamSources()

    # Extract option
    newSerialization = options.privateOptions['newSerialization']

    # Handle listTasks
    if options.privateOptions['listTasks']:
        print "Available tasks:"

        for label in [t['taskLabel'] for t in experimentTasks]:
            print "\t", label

        return None

    # Construct the experiment instance
    if options.privateOptions['runCheckpointName']:

        assert model is None

        checkpointName = options.privateOptions['runCheckpointName']

        model = ModelFactory.loadFromCheckpoint(
            savedModelDir=_getModelCheckpointDir(experimentDir,
                                                 checkpointName),
            newSerialization=newSerialization)

    elif model is not None:
        print "Skipping creation of OPFExperiment instance: caller provided his own"
    else:
        modelDescription = expIface.getModelDescription()
        model = ModelFactory.create(modelDescription)

    # Handle "create model" request
    if options.privateOptions['createCheckpointName']:
        checkpointName = options.privateOptions['createCheckpointName']
        _saveModel(model=model,
                   experimentDir=experimentDir,
                   checkpointLabel=checkpointName,
                   newSerialization=newSerialization)

        return model

    # Build the task list

    # Default task execution index list is in the natural list order of the tasks
    taskIndexList = range(len(experimentTasks))

    customTaskExecutionLabelsList = options.privateOptions['taskLabels']
    if customTaskExecutionLabelsList:
        taskLabelsList = [t['taskLabel'] for t in experimentTasks]
        taskLabelsSet = set(taskLabelsList)

        customTaskExecutionLabelsSet = set(customTaskExecutionLabelsList)

        assert customTaskExecutionLabelsSet.issubset(taskLabelsSet), \
               ("Some custom-provided task execution labels don't correspond "
                "to actual task labels: mismatched labels: %r; actual task "
                "labels: %r.") % (customTaskExecutionLabelsSet - taskLabelsSet,
                                  customTaskExecutionLabelsList)

        taskIndexList = [
            taskLabelsList.index(label)
            for label in customTaskExecutionLabelsList
        ]

        print "#### Executing custom task list: %r" % [
            taskLabelsList[i] for i in taskIndexList
        ]

    # Run all experiment tasks
    for taskIndex in taskIndexList:

        task = experimentTasks[taskIndex]

        # Create a task runner and run it!
        taskRunner = _TaskRunner(model=model, task=task, cmdOptions=options)
        taskRunner.run()
        del taskRunner

        if options.privateOptions['checkpointModel']:
            _saveModel(model=model,
                       experimentDir=experimentDir,
                       checkpointLabel=task['taskLabel'],
                       newSerialization=newSerialization)

    return model

Example #6

Show file

File: SwarmWorker.py Project: zacg/nupic

  def run(self):
    """ Run this worker.

    Connects to the jobs database, optionally inserts a new job built from
    options.params, validates the job's params against jobParamsSchema.json,
    creates the swarm object and then loops: pick (or create) a model, run
    it through the swarm object, report progress on stderr. The loop ends
    when the swarm object signals exit, or after one model when
    options.modelID was given.

    Parameters:
    ----------------------------------------------------------------------
    retval:     jobID of the job we ran. This is used by unit test code
                  when calling this working using the --params command
                  line option (which tells this worker to insert the job
                  itself).

    Raises:     RuntimeError if the job params' 'hsVersion' is not 'v2'.
    """
    # Easier access to options
    options = self._options

    # ---------------------------------------------------------------------
    # Connect to the jobs database
    self.logger.info("Connecting to the jobs database")
    cjDAO = ClientJobsDAO.get()

    # Get our worker ID
    self._workerID = cjDAO.getConnectionID()

    if options.clearModels:
      cjDAO.modelsClearAll()

    # -------------------------------------------------------------------------
    # if params were specified on the command line, insert a new job using
    #  them.
    if options.params is not None:
      options.jobID = cjDAO.jobInsert(client='hwTest', cmdLine="echo 'test mode'",
                  params=options.params, alreadyRunning=True,
                  minimumWorkers=1, maximumWorkers=1,
                  jobType = cjDAO.JOB_TYPE_HS)
    # The worker ID shown in the log prefix may be overridden from the options;
    #  self._workerID itself is left untouched.
    if options.workerID is not None:
      wID = options.workerID
    else:
      wID = self._workerID
    
    buildID = Configuration.get('nupic.software.buildNumber', 'N/A')
    logPrefix = '<BUILDID=%s, WORKER=HW, WRKID=%s, JOBID=%s> ' % \
                (buildID, wID, options.jobID)
    ExtendedLogger.setLogPrefix(logPrefix)

    # ---------------------------------------------------------------------
    # Get the search parameters
    # If asked to reset the job status, do that now
    if options.resetJobStatus:
      cjDAO.jobSetFields(options.jobID,
           fields={'workerCompletionReason': ClientJobsDAO.CMPL_REASON_SUCCESS,
                   'cancel': False,
                   #'engWorkerState': None
                   },
           useConnectionID=False,
           ignoreUnchanged=True)
    jobInfo = cjDAO.jobInfo(options.jobID)
    self.logger.info("Job info retrieved: %s" % (str(clippedObj(jobInfo))))


    # ---------------------------------------------------------------------
    # Instantiate the swarm object, which will handle the logic of
    #  which models to create when we need more to evaluate.
    jobParams = json.loads(jobInfo.params)

    # Validate job params
    jsonSchemaPath = os.path.join(os.path.dirname(__file__),
                                  "jsonschema",
                                  "jobParamsSchema.json")
    jsonhelpers.validate(jobParams, schemaPath=jsonSchemaPath)


    # Only the 'v2' swarm implementation is accepted; anything else is an error
    hsVersion = jobParams.get('hsVersion', None)
    if hsVersion == 'v2':
      self._hs = SwarmV2(searchParams=jobParams, workerID=self._workerID,
              cjDAO=cjDAO, jobID=options.jobID, logLevel=options.logLevel)
    else:
      raise RuntimeError("Invalid swarm implementation (%s) specified" \
                          % (hsVersion))


    # =====================================================================
    # The main loop.
    try:
      # NOTE: this local name shadows the builtin exit() inside this method
      exit = False
      numModelsTotal = 0
      # NOTE(review): the "reporter:..." lines written to stderr look like
      #  status/counter messages for an external job reporter - confirm who
      #  consumes them
      print >>sys.stderr, "reporter:status:Evaluating first model..."
      while not exit:

        # ------------------------------------------------------------------
        # Choose a model to evaluate
        batchSize = 10              # How many to try at a time.
        modelIDToRun = None
        while modelIDToRun is None:

          if options.modelID is None:
            # -----------------------------------------------------------------
            # Get the latest results on all running models and send them to
            #  the swarm implementation
            # This calls cjDAO.modelsGetUpdateCounters(), compares the
            # updateCounters with what we have cached, fetches the results for the
            # changed and new models, and sends those to the swarm
            # implementation's self._hs.recordModelProgress() method.
            self._processUpdatedModels(cjDAO)
  
            # --------------------------------------------------------------------
            # Create a new batch of models
            (exit, newModels) = self._hs.createModels(numModels = batchSize)
            # This break only leaves the inner model-selection loop; the
            #  "if exit" check after that loop then leaves the main loop.
            if exit:
              break

            # No more models left to create, just loop. The _hs is waiting for
            #   all remaining running models to complete, and may pick up on an
            #  orphan if it detects one.
            if len(newModels) == 0:
              continue
  
            # Try and insert one that we will run
            for (modelParams, modelParamsHash, particleHash) in newModels:
              jsonModelParams = json.dumps(modelParams)
              (modelID, ours) = cjDAO.modelInsertAndStart(options.jobID,
                                  jsonModelParams, modelParamsHash, particleHash)
  
              # Some other worker is already running it, tell the swarm object
              #  so that it doesn't try and insert it again
              if not ours:
                mParamsAndHash = cjDAO.modelsGetParams([modelID])[0]
                mResult = cjDAO.modelsGetResultAndStatus([modelID])[0]
                results = mResult.results
                if results is not None:
                  results = json.loads(results)
  
                modelParams = json.loads(mParamsAndHash.params)
                particleHash = cjDAO.modelsGetFields(modelID, 
                                  ['engParticleHash'])[0]
                particleInst = "%s.%s" % (
                          modelParams['particleState']['id'],
                          modelParams['particleState']['genIdx'])
                self.logger.info("Adding model %d to our internal DB " \
                      "because modelInsertAndStart() failed to insert it: " \
                      "paramsHash=%s, particleHash=%s, particleId='%s'", modelID, 
                      mParamsAndHash.engParamsHash.encode('hex'),
                      particleHash.encode('hex'), particleInst)
                self._hs.recordModelProgress(modelID = modelID,
                      modelParams = modelParams,
                      modelParamsHash = mParamsAndHash.engParamsHash,
                      results = results,
                      completed = (mResult.status == cjDAO.STATUS_COMPLETED),
                      completionReason = mResult.completionReason,
                      matured = mResult.engMatured,
                      numRecords = mResult.numRecords)
              else:
                # The insert succeeded for us: this is the model we will run.
                #  The remaining candidates of this batch are not inserted.
                modelIDToRun = modelID
                break
  
          else:
            # A specific modelID was passed on the command line
            modelIDToRun = int(options.modelID)
            mParamsAndHash = cjDAO.modelsGetParams([modelIDToRun])[0]
            modelParams = json.loads(mParamsAndHash.params)
            modelParamsHash = mParamsAndHash.engParamsHash
            
            # Make us the worker
            cjDAO.modelSetFields(modelIDToRun,
                                     dict(engWorkerConnId=self._workerID))
            # NOTE(review): the condition below is the constant False, so this
            #  whole branch never executes. If it is ever re-enabled, note the
            #  bare "except:" (it would also catch KeyboardInterrupt and
            #  SystemExit) and that modelInsertAndStart() is called here with
            #  three arguments, while the call earlier in this method passes
            #  four.
            if False:
              # Change the hash and params of the old entry so that we can
              #  create a new model with the same params
              for attempt in range(1000):
                paramsHash = hashlib.md5("OrphanParams.%d.%d" % (modelIDToRun,
                                                                 attempt)).digest()
                particleHash = hashlib.md5("OrphanParticle.%d.%d" % (modelIDToRun,
                                                                  attempt)).digest()
                try:
                  cjDAO.modelSetFields(modelIDToRun,
                                           dict(engParamsHash=paramsHash,
                                                engParticleHash=particleHash))
                  success = True
                except:
                  success = False
                if success:
                  break
              if not success:
                raise RuntimeError("Unexpected failure to change paramsHash and "
                                   "particleHash of orphaned model")
              
              (modelIDToRun, ours) = cjDAO.modelInsertAndStart(options.jobID,
                                  mParamsAndHash.params, modelParamsHash)

            
            
            # ^^^ end while modelIDToRun ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

        # ---------------------------------------------------------------
        # We have a model, evaluate it now
        # All done?
        if exit:
          break

        # Run the model now
        self.logger.info("RUNNING MODEL GID=%d, paramsHash=%s, params=%s",
              modelIDToRun, modelParamsHash.encode('hex'), modelParams)

        # ---------------------------------------------------------------------
        # Construct model checkpoint GUID for this model:
        # jobParams['persistentJobGUID'] contains the client's (e.g., API Server)
        # persistent, globally-unique model identifier, which is what we need;
        persistentJobGUID = jobParams['persistentJobGUID']
        assert persistentJobGUID, "persistentJobGUID: %r" % (persistentJobGUID,)

        # Resulting GUID has the form <client>_<persistentJobGUID>_<modelID>
        modelCheckpointGUID = jobInfo.client + "_" + persistentJobGUID + (
          '_' + str(modelIDToRun))


        self._hs.runModel(modelID=modelIDToRun, jobID = options.jobID,
                          modelParams=modelParams, modelParamsHash=modelParamsHash,
                          jobsDAO=cjDAO, modelCheckpointGUID=modelCheckpointGUID)

        # TODO: don't increment for orphaned models
        numModelsTotal += 1

        self.logger.info("COMPLETED MODEL GID=%d; EVALUATED %d MODELs",
          modelIDToRun, numModelsTotal)
        print >>sys.stderr, "reporter:status:Evaluated %d models..." % \
                                    (numModelsTotal)
        print >>sys.stderr, "reporter:counter:swarmWorker,numModels,1"

        # When a specific model was requested, run only that one model
        if options.modelID is not None:
          exit = True
        # ^^^ end while not exit

    finally:
      # Provide swarm instance an opportunity to clean up temporary files
      self._hs.close()

    self.logger.info("FINISHED. Evaluated %d models." % (numModelsTotal))
    print >>sys.stderr, "reporter:status:Finished, evaluated %d models" % (numModelsTotal)
    return options.jobID

Example #7

Show file

    def __init__(self,
                 streamDef,
                 bookmark=None,
                 saveOutput=False,
                 isBlocking=True,
                 maxTimeout=0,
                 eofOnTimeout=False):
        """ Base class constructor, performs common initialization

    Parameters:
    ----------------------------------------------------------------
    streamDef:  The stream definition, potentially containing multiple sources
                (not supported yet). See
                /nupic/frameworks/opf/jsonschema/stream_def.json for the format
                of this dict

    bookmark: Bookmark to start reading from. This overrides the first_record
                field of the streamDef if provided.

    saveOutput: If true, save the output to a csv file in a temp directory.
                The path to the generated file can be found in the log
                output.

    isBlocking: should read operation block *forever* if the next row of data
                is not available, but the stream is not marked as 'completed'
                yet?

    maxTimeout: if isBlocking is False, max seconds to wait for more data before
                timing out; ignored when isBlocking is True.

    eofOnTimeout: If True and we get a read timeout (isBlocking must be False
                to get read timeouts), assume we've reached the end of the
                input and produce the last aggregated record, if one can be
                completed.

    """

        # Call superclass constructor
        super(StreamReader, self).__init__()

        loggerPrefix = 'com.numenta.nupic.data.StreamReader'
        self._logger = logging.getLogger(loggerPrefix)
        jsonhelpers.validate(streamDef,
                             schemaPath=pkg_resources.resource_filename(
                                 jsonschema.__name__, "stream_def.json"))
        assert len(
            streamDef['streams']) == 1, "Only 1 source stream is supported"

        # Save constructor args
        sourceDict = streamDef['streams'][0]
        self._recordCount = 0
        self._eofOnTimeout = eofOnTimeout
        self._logger.debug('Reading stream with the def: %s', sourceDict)

        # Dictionary to store record statistics (min and max of scalars for now)
        self._stats = None

        # ---------------------------------------------------------------------
        # Get the stream definition params

        # Limiting window of the stream. It would not return any records until
        # 'first_record' ID is read (or very first with the ID above that). The
        # stream will return EOS once it reads record with ID 'last_record' or
        # above (NOTE: the name 'lastRecord' is misleading because it is NOT
        #  inclusive).
        firstRecordIdx = sourceDict.get('first_record', None)
        self._sourceLastRecordIdx = sourceDict.get('last_record', None)

        # If a bookmark was given, then override first_record from the stream
        #  definition.
        if bookmark is not None:
            firstRecordIdx = None

        # Column names must be provided in the streamdef json
        # Special case is ['*'], meaning all available names from the record stream
        self._streamFieldNames = sourceDict.get('columns', None)
        if self._streamFieldNames != None and self._streamFieldNames[0] == '*':
            self._needFieldsFiltering = False
        else:
            self._needFieldsFiltering = True

        # Types must be specified in streamdef json, or in case of the
        #  file_recod_stream types could be implicit from the file
        streamFieldTypes = sourceDict.get('types', None)
        self._logger.debug('Types from the def: %s', streamFieldTypes)
        # Validate that all types are valid
        if streamFieldTypes is not None:
            for dataType in streamFieldTypes:
                assert FieldMetaType.isValid(dataType)

        # Reset, sequence and time fields might be provided by streamdef json
        streamResetFieldName = streamDef.get('resetField', None)
        streamTimeFieldName = streamDef.get('timeField', None)
        streamSequenceFieldName = streamDef.get('sequenceIdField', None)
        self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                           streamTimeFieldName, streamSequenceFieldName)

        # =======================================================================
        # Open up the underlying record store
        dataUrl = sourceDict.get('source', None)
        assert dataUrl is not None
        self._recordStore = self._openStream(dataUrl, isBlocking, maxTimeout,
                                             bookmark, firstRecordIdx)
        assert self._recordStore is not None

        # =======================================================================
        # Prepare the data structures we need for returning just the fields
        #  the caller wants from each record
        recordStoreFields = self._recordStore.getFields()
        self._recordStoreFieldNames = self._recordStore.getFieldNames()

        if not self._needFieldsFiltering:
            self._streamFieldNames = self._recordStoreFieldNames

        # Build up the field definitions for each field. This is a list of tuples
        #  of (name, type, special)
        self._streamFields = []
        for dstIdx, name in enumerate(self._streamFieldNames):
            if name not in self._recordStoreFieldNames:
                raise RuntimeError(
                    "The column '%s' from the stream definition "
                    "is not present in the underlying stream which has the following "
                    "columns: %s" % (name, self._recordStoreFieldNames))

            fieldIdx = self._recordStoreFieldNames.index(name)
            fieldType = recordStoreFields[fieldIdx].type
            fieldSpecial = recordStoreFields[fieldIdx].special

            # If the types or specials were defined in the stream definition,
            #   then override what was found in the record store
            if streamFieldTypes is not None:
                fieldType = streamFieldTypes[dstIdx]

            if streamResetFieldName is not None and streamResetFieldName == name:
                fieldSpecial = FieldMetaSpecial.reset
            if streamTimeFieldName is not None and streamTimeFieldName == name:
                fieldSpecial = FieldMetaSpecial.timestamp
            if (streamSequenceFieldName is not None
                    and streamSequenceFieldName == name):
                fieldSpecial = FieldMetaSpecial.sequence

            self._streamFields.append(
                FieldMetaInfo(name, fieldType, fieldSpecial))

        # ========================================================================
        # Create the aggregator which will handle aggregation of records before
        #  returning them.
        self._aggregator = Aggregator(
            aggregationInfo=streamDef.get('aggregation', None),
            inputFields=recordStoreFields,
            timeFieldName=streamDef.get('timeField', None),
            sequenceIdFieldName=streamDef.get('sequenceIdField', None),
            resetFieldName=streamDef.get('resetField', None))

        # We rely on the aggregator to tell us the bookmark of the last raw input
        #  that contributed to the aggregated record
        self._aggBookmark = None

        # Compute the aggregation period in terms of months and seconds
        if 'aggregation' in streamDef:
            self._aggMonthsAndSeconds = nupic.support.aggregationToMonthsSeconds(
                streamDef.get('aggregation'))
        else:
            self._aggMonthsAndSeconds = None

        # ========================================================================
        # Are we saving the generated output to a csv?
        if saveOutput:
            tmpDir = tempfile.mkdtemp()
            outFilename = os.path.join(tmpDir, "generated_output.csv")
            self._logger.info(
                "StreamReader: Saving generated records to: '%s'" %
                outFilename)
            self._writer = FileRecordStream(streamID=outFilename,
                                            write=True,
                                            fields=self._streamFields)
        else:
            self._writer = None

Example #8

Show file

    def run(self):
        """ Run this worker.

        Connects to the jobs database, optionally inserts a new job built from
        options.params, validates the job's params against
        jsonschema/jobParamsSchema.json, and then repeatedly picks a model
        (either a new one proposed by the swarm object, or the one named by
        options.modelID) and evaluates it until the swarm object reports that
        it is done. The swarm object is always closed on the way out.

        Parameters:
        ----------------------------------------------------------------------
        retval:     jobID of the job we ran. This is used by unit test code
                      when calling this worker using the --params command
                      line option (which tells this worker to insert the job
                      itself).

        Raises:     RuntimeError if the job params request a swarm
                      implementation other than 'v2'. Errors raised by
                      jsonhelpers.validate() for invalid job params are not
                      caught here and propagate to the caller.
        """
        # Easier access to options
        options = self._options

        # ---------------------------------------------------------------------
        # Connect to the jobs database
        self.logger.info("Connecting to the jobs database")
        cjDAO = ClientJobsDAO.get()

        # Get our worker ID
        self._workerID = cjDAO.getConnectionID()

        if options.clearModels:
            cjDAO.modelsClearAll()

        # ---------------------------------------------------------------------
        # If params were specified on the command line, insert a new job using
        #  them.
        if options.params is not None:
            options.jobID = cjDAO.jobInsert(client='hwTest',
                                            cmdLine="echo 'test mode'",
                                            params=options.params,
                                            alreadyRunning=True,
                                            minimumWorkers=1,
                                            maximumWorkers=1,
                                            jobType=cjDAO.JOB_TYPE_HS)

        # The log prefix shows the worker ID given in the options when there
        #  is one, otherwise our own connection ID.
        if options.workerID is not None:
            wID = options.workerID
        else:
            wID = self._workerID

        buildID = Configuration.get('nupic.software.buildNumber', 'N/A')
        logPrefix = '<BUILDID=%s, WORKER=HW, WRKID=%s, JOBID=%s> ' % (
            buildID, wID, options.jobID)
        ExtendedLogger.setLogPrefix(logPrefix)

        # ---------------------------------------------------------------------
        # Get the search parameters
        # If asked to reset the job status, do that now
        if options.resetJobStatus:
            cjDAO.jobSetFields(
                options.jobID,
                fields={
                    'workerCompletionReason':
                    ClientJobsDAO.CMPL_REASON_SUCCESS,
                    'cancel': False,
                },
                useConnectionID=False,
                ignoreUnchanged=True)
        jobInfo = cjDAO.jobInfo(options.jobID)
        self.logger.info("Job info retrieved: %s", clippedObj(jobInfo))

        # ---------------------------------------------------------------------
        # Instantiate the swarm object, which will handle the logic of
        #  which models to create when we need more to evaluate.
        jobParams = json.loads(jobInfo.params)

        # Validate job params
        jsonSchemaPath = os.path.join(os.path.dirname(__file__), "jsonschema",
                                      "jobParamsSchema.json")
        jsonhelpers.validate(jobParams, schemaPath=jsonSchemaPath)

        # Only the 'v2' swarm implementation is supported
        hsVersion = jobParams.get('hsVersion', None)
        if hsVersion != 'v2':
            raise RuntimeError(
                "Invalid swarm implementation (%s) specified" % (hsVersion, ))

        self._hs = SwarmV2(searchParams=jobParams,
                           workerID=self._workerID,
                           cjDAO=cjDAO,
                           jobID=options.jobID,
                           logLevel=options.logLevel)

        # =====================================================================
        # The main loop.
        try:
            done = False
            numModelsTotal = 0
            print >> sys.stderr, "reporter:status:Evaluating first model..."
            while not done:

                # -------------------------------------------------------------
                # Choose a model to evaluate
                batchSize = 10  # How many to try at a time.
                modelIDToRun = None
                while modelIDToRun is None:

                    if options.modelID is not None:
                        # A specific modelID was passed on the command line
                        modelIDToRun = int(options.modelID)
                        mParamsAndHash = cjDAO.modelsGetParams(
                            [modelIDToRun])[0]
                        modelParams = json.loads(mParamsAndHash.params)
                        modelParamsHash = mParamsAndHash.engParamsHash

                        # Make us the worker
                        cjDAO.modelSetFields(
                            modelIDToRun, dict(engWorkerConnId=self._workerID))
                        break

                    # ---------------------------------------------------------
                    # Get the latest results on all running models and send
                    #  them to the swarm implementation.
                    self._processUpdatedModels(cjDAO)

                    # ---------------------------------------------------------
                    # Create a new batch of models
                    (done, newModels) = self._hs.createModels(
                        numModels=batchSize)
                    if done:
                        break

                    # No more models left to create, just loop. The _hs is
                    #  waiting for all remaining running models to complete,
                    #  and may pick up on an orphan if it detects one.
                    if len(newModels) == 0:
                        continue

                    # Try and insert one that we will run
                    for (modelParams, modelParamsHash,
                         particleHash) in newModels:
                        jsonModelParams = json.dumps(modelParams)
                        (modelID, ours) = cjDAO.modelInsertAndStart(
                            options.jobID, jsonModelParams,
                            modelParamsHash, particleHash)

                        if ours:
                            # We own this model; modelParams and
                            #  modelParamsHash from this iteration describe it.
                            modelIDToRun = modelID
                            break

                        # Some other worker is already running it, tell the
                        #  swarm object so that it doesn't try and insert it
                        #  again
                        mParamsAndHash = cjDAO.modelsGetParams([modelID])[0]
                        mResult = cjDAO.modelsGetResultAndStatus(
                            [modelID])[0]
                        results = mResult.results
                        if results is not None:
                            results = json.loads(results)

                        modelParams = json.loads(mParamsAndHash.params)
                        particleHash = cjDAO.modelsGetFields(
                            modelID, ['engParticleHash'])[0]
                        particleInst = "%s.%s" % (
                            modelParams['particleState']['id'],
                            modelParams['particleState']['genIdx'])
                        self.logger.info(
                            "Adding model %d to our internal DB "
                            "because modelInsertAndStart() failed to insert it: "
                            "paramsHash=%s, particleHash=%s, particleId='%s'",
                            modelID,
                            mParamsAndHash.engParamsHash.encode('hex'),
                            particleHash.encode('hex'), particleInst)
                        self._hs.recordModelProgress(
                            modelID=modelID,
                            modelParams=modelParams,
                            modelParamsHash=mParamsAndHash.engParamsHash,
                            results=results,
                            completed=(mResult.status ==
                                       cjDAO.STATUS_COMPLETED),
                            completionReason=mResult.completionReason,
                            matured=mResult.engMatured,
                            numRecords=mResult.numRecords)

                # -------------------------------------------------------------
                # All done? (the swarm asked us to stop before we got a model)
                if done:
                    break

                # We have a model, evaluate it now
                self.logger.info(
                    "RUNNING MODEL GID=%d, paramsHash=%s, params=%s",
                    modelIDToRun, modelParamsHash.encode('hex'), modelParams)

                # -------------------------------------------------------------
                # Construct model checkpoint GUID for this model:
                # jobParams['persistentJobGUID'] contains the client's (e.g.,
                # API Server) persistent, globally-unique model identifier,
                # which is what we need;
                persistentJobGUID = jobParams['persistentJobGUID']
                assert persistentJobGUID, "persistentJobGUID: %r" % (
                    persistentJobGUID, )

                modelCheckpointGUID = jobInfo.client + "_" + persistentJobGUID + (
                    '_' + str(modelIDToRun))

                self._hs.runModel(modelID=modelIDToRun,
                                  jobID=options.jobID,
                                  modelParams=modelParams,
                                  modelParamsHash=modelParamsHash,
                                  jobsDAO=cjDAO,
                                  modelCheckpointGUID=modelCheckpointGUID)

                # TODO: don't increment for orphaned models
                numModelsTotal += 1

                self.logger.info("COMPLETED MODEL GID=%d; EVALUATED %d MODELs",
                                 modelIDToRun, numModelsTotal)
                print >> sys.stderr, (
                    "reporter:status:Evaluated %d models..." %
                    (numModelsTotal, ))
                print >> sys.stderr, "reporter:counter:swarmWorker,numModels,1"

                # When a specific model was requested we only run that one
                if options.modelID is not None:
                    done = True

        finally:
            # Provide swarm instance an opportunity to clean up temporary files
            self._hs.close()

        self.logger.info("FINISHED. Evaluated %d models.", numModelsTotal)
        print >> sys.stderr, (
            "reporter:status:Finished, evaluated %d models" %
            (numModelsTotal, ))
        return options.jobID

Example #9

Show file

File: stream_reader.py Project: Alleyfield/nupic

  def __init__(self, streamDef, bookmark=None, saveOutput=False,
               isBlocking=True, maxTimeout=0, eofOnTimeout=False):
    """ Base class constructor, performs common initialization

    Parameters:
    ----------------------------------------------------------------
    streamDef:  The stream definition, potentially containing multiple sources
                (not supported yet). See
                /nupic/frameworks/opf/jsonschema/stream_def.json for the format
                of this dict

    bookmark: Bookmark to start reading from. This overrides the first_record
                field of the streamDef if provided.

    saveOutput: If true, save the output to a csv file in a temp directory.
                The path to the generated file can be found in the log
                output.

    isBlocking: should read operation block *forever* if the next row of data
                is not available, but the stream is not marked as 'completed'
                yet?

    maxTimeout: if isBlocking is False, max seconds to wait for more data before
                timing out; ignored when isBlocking is True.

    eofOnTimeout: If True and we get a read timeout (isBlocking must be False
                to get read timeouts), assume we've reached the end of the
                input and produce the last aggregated record, if one can be
                completed.

    """

    # Call superclass constructor
    super(StreamReader, self).__init__()

    loggerPrefix = 'com.numenta.nupic.data.StreamReader'
    self._logger = logging.getLogger(loggerPrefix)
    jsonhelpers.validate(streamDef,
                         schemaPath=resource_filename(
                             jsonschema.__name__, "stream_def.json"))
    assert len(streamDef['streams']) == 1, "Only 1 source stream is supported"

    # Save constructor args
    sourceDict = streamDef['streams'][0]
    self._recordCount = 0
    self._eofOnTimeout = eofOnTimeout
    self._logger.debug('Reading stream with the def: %s', sourceDict)

    # Dictionary to store record statistics (min and max of scalars for now)
    self._stats = None

    # ---------------------------------------------------------------------
    # Get the stream definition params

    # Limiting window of the stream. It would not return any records until
    # 'first_record' ID is read (or very first with the ID above that). The
    # stream will return EOS once it reads record with ID 'last_record' or
    # above (NOTE: the name 'lastRecord' is misleading because it is NOT
    #  inclusive).
    firstRecordIdx = sourceDict.get('first_record', None)
    self._sourceLastRecordIdx = sourceDict.get('last_record', None)

    # If a bookmark was given, then override first_record from the stream
    #  definition.
    if bookmark is not None:
      firstRecordIdx = None


    # Column names must be provided in the streamdef json
    # Special case is ['*'], meaning all available names from the record stream
    self._streamFieldNames = sourceDict.get('columns', None)
    if self._streamFieldNames != None and self._streamFieldNames[0] == '*':
      self._needFieldsFiltering = False
    else:
      self._needFieldsFiltering = True

    # Types must be specified in streamdef json, or in case of the
    #  file_recod_stream types could be implicit from the file
    streamFieldTypes = sourceDict.get('types', None)
    self._logger.debug('Types from the def: %s', streamFieldTypes)
    # Validate that all types are valid
    if streamFieldTypes != None:
      for dataType in streamFieldTypes:
        assert(dataType in TYPES)

    # Reset, sequence and time fields might be provided by streamdef json
    streamResetFieldName = streamDef.get('resetField', None)
    streamTimeFieldName = streamDef.get('timeField', None)
    streamSequenceFieldName = streamDef.get('sequenceIdField', None)
    self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                                                      streamTimeFieldName,
                                                      streamSequenceFieldName)


    # ================================