Beispiel #1
0
    def testBadDataset(self):

        filename = _getTempFileName()

        print 'Creating tempfile:', filename

        # Write bad dataset with records going backwards in time
        fields = [
            FieldMetaInfo('timestamp', FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp)
        ]
        o = FileRecordStream(streamID=filename, write=True, fields=fields)
        # Records
        records = ([datetime(day=3, month=3, year=2010)],
                   [datetime(day=2, month=3, year=2010)])

        o.appendRecord(records[0])
        o.appendRecord(records[1])
        o.close()

        # Write bad dataset with broken sequences
        fields = [
            FieldMetaInfo('sid', FieldMetaType.integer,
                          FieldMetaSpecial.sequence)
        ]
        o = FileRecordStream(streamID=filename, write=True, fields=fields)
        # Records
        records = ([1], [2], [1])

        o.appendRecord(records[0])
        o.appendRecord(records[1])
        self.assertRaises(Exception, o.appendRecord, (records[2], ))
        o.close()
  def __getListMetaInfo(self, inferenceElement):
    """ Get field metadata information for inferences that are of list type
    TODO: Right now we assume list inferences are associated with the input field
    metadata

    For every input field this produces a "<field>.<inferenceLabel>" column
    that copies the input field's type and special flag. When the inference
    element has an associated input element, each of those columns is preceded
    by a "<field>.actual" column.

    Parameters:
      inferenceElement: the InferenceElement whose columns are described

    Returns: a list of FieldMetaInfo objects, in input-field order
    """
    fieldMetaInfo = []
    inferenceLabel = InferenceElement.getLabel(inferenceElement)

    # The answer does not depend on the individual input field, so look it up
    # once instead of on every iteration.
    hasInputElement = bool(InferenceElement.getInputElement(inferenceElement))

    for inputFieldMeta in self.__inputFieldsMeta:
      # Only emit the ".actual" column when there is an input element to
      # report. Previously the column was appended unconditionally even though
      # it was only created inside this branch, which raised
      # UnboundLocalError for inference elements without an input element.
      if hasInputElement:
        fieldMetaInfo.append(FieldMetaInfo(
          name=inputFieldMeta.name + ".actual",
          type=inputFieldMeta.type,
          special=inputFieldMeta.special
        ))

      fieldMetaInfo.append(FieldMetaInfo(
        name=inputFieldMeta.name + "." + inferenceLabel,
        type=inputFieldMeta.type,
        special=inputFieldMeta.special
      ))

    return fieldMetaInfo
Beispiel #3
0
    def testEncoderWithoutResetAndSequenceFields(self):
        """Encode two records with a schema that has no reset/sequence field.

        Both encodings are expected to carry '_reset' == 0 and
        '_sequenceId' == 0, i.e. the sequence id must not change between
        records.
        """
        encoder = ModelRecordEncoder(fields=[
            FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
            FieldMetaInfo('timestamp', FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp),
            FieldMetaInfo('integer', FieldMetaType.integer,
                          FieldMetaSpecial.none),
            FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
            FieldMetaInfo('categories', FieldMetaType.list,
                          FieldMetaSpecial.category)
        ])

        # The second pass verifies that the sequence id is still 0.
        for recordName, day in (('rec_1', 1), ('rec_2', 2)):
            encoded = encoder.encode([
                recordName,
                datetime(day=day, month=3, year=2010), 5, 6.5, [0, 1, 2]
            ])

            expected = {
                'name': recordName,
                'timestamp': datetime(2010, 3, day, 0, 0),
                'integer': 5,
                'real': 6.5,
                'categories': [0, 1, 2],
                '_category': [0, 1, 2],
                '_reset': 0,
                '_sequenceId': 0,
                '_timestamp': datetime(2010, 3, day, 0, 0),
                '_timestampRecordIdx': None
            }
            self.assertEqual(encoded, expected)
Beispiel #4
0
  def testFieldMetaInfo(self):
    """FieldMetaInfo objects built from file-field tuples equal those tuples."""
    # Single instance built from one File field's meta-data tuple
    fieldTuple = ("pounds", FieldMetaType.float, FieldMetaSpecial.none)
    self.assertEqual(fieldTuple,
                     FieldMetaInfo.createFromFileFieldElement(fieldTuple))

    # List of instances built from a list of File meta-data tuples
    fieldTuples = [
      ("pounds", FieldMetaType.float, FieldMetaSpecial.none),
      ("price", FieldMetaType.float, FieldMetaSpecial.none),
      ("id", FieldMetaType.string, FieldMetaSpecial.sequence),
      ("date", FieldMetaType.datetime, FieldMetaSpecial.timestamp),
    ]
    self.assertEqual(fieldTuples,
                     FieldMetaInfo.createListFromFileFieldList(fieldTuples))
 def getDatasetFieldMetaData(self):
   """ [virtual method override]
   Describe the dataset's columns using the fields reported by the reader.

   Returns:      the dataset field metadata descriptors, arranged in the same
                 order as the columns in the dataset. Each descriptor is of
                 type nupic.data.fieldmeta.FieldMetaInfo; the container is
                 whatever FieldMetaInfo.createListFromFileFieldList returns.
   """
   readerFields = self._reader.getFields()
   return FieldMetaInfo.createListFromFileFieldList(readerFields)
Beispiel #6
0
  def testFieldMetaInfo(self):
    """FieldMetaInfo objects built from file-field tuples equal those tuples."""
    # Single instance built from one File field's meta-data tuple
    singleTuple = ("pounds", FieldMetaType.float, FieldMetaSpecial.none)
    singleMeta = FieldMetaInfo.createFromFileFieldElement(singleTuple)
    self.assertEqual(singleTuple, singleMeta)

    # List of instances built from a list of File meta-data tuples
    tupleList = [
      ("pounds", FieldMetaType.float, FieldMetaSpecial.none),
      ("price", FieldMetaType.float, FieldMetaSpecial.none),
      ("id", FieldMetaType.string, FieldMetaSpecial.sequence),
      ("date", FieldMetaType.datetime, FieldMetaSpecial.timestamp),
    ]
    metaList = FieldMetaInfo.createListFromFileFieldList(tupleList)
    self.assertEqual(tupleList, metaList)
Beispiel #7
0
 def getDatasetFieldMetaData(self):
   """ [virtual method override]
   Convert the reader's field descriptions into FieldMetaInfo descriptors.

   Returns:      dataset field metadata descriptors in dataset column order;
                 each one is a nupic.data.fieldmeta.FieldMetaInfo, as produced
                 by FieldMetaInfo.createListFromFileFieldList.
   """
   fileFields = self._reader.getFields()
   datasetFieldMeta = FieldMetaInfo.createListFromFileFieldList(fileFields)
   return datasetFieldMeta
  def __getDictMetaInfo(self, inferenceElement, inferenceDict):
    """Get field metadata information for inferences that are of dict type

    Produces one string-typed column per key of inferenceDict, named
    "<inferenceLabel>.<key>" and ordered by sorted key. If the inference
    element has an associated input element, a "<inferenceLabel>.actual"
    column comes first.
    """
    inferenceLabel = InferenceElement.getLabel(inferenceElement)

    # Collect the column-name suffixes first, then build every column the
    # same way.
    suffixes = []
    if InferenceElement.getInputElement(inferenceElement):
      suffixes.append("actual")
    for key in sorted(inferenceDict.keys()):
      suffixes.append(str(key))

    return [FieldMetaInfo(name=inferenceLabel + "." + suffix,
                          type=FieldMetaType.string,
                          special='')
            for suffix in suffixes]
Beispiel #9
0
    def testRewindBeforeModelRecordEncoderIsCreated(self):
        """rewind() must work on a stream whose record encoder is still None."""
        nameField = FieldMetaInfo('name', FieldMetaType.string,
                                  FieldMetaSpecial.none)
        stream = self.MyRecordStream([nameField])

        # Precondition: no encoder has been created yet ...
        self.assertIsNone(stream._modelRecordEncoder)
        # ... and rewinding must not crash by operating on the absent encoder.
        stream.rewind()
Beispiel #10
0
    def __init__(self, fn, iterations, historyBuffer=None):
        """Set up a record source driven by the callable fn.

        fn:            callable; fn(0) is invoked here and must return a
                       dict-like object mapping field name to value
        iterations:    number of iterations; stored and also turned into an
                       iterator over xrange(iterations)
        historyBuffer: stored as self.history

        Note that this re-seeds the global random module with 42.
        """
        random.seed(42)

        self.nIterations = iterations
        self.fn = fn
        self.iterations = iter(xrange(iterations))
        self.history = historyBuffer

        # Derive the field metadata from the record produced for index 0:
        # one (name, type, special) triple per field, with an empty special.
        sampleRecord = fn(0)
        fieldSpecs = []
        for fieldName, fieldValue in sampleRecord.iteritems():
            fieldSpecs.append((fieldName, self.__getType(fieldValue), ''))

        self.__metaData = FieldMetaInfo.createListFromFileFieldList(fieldSpecs)
def _testTemporalShift():
    """ Test to see if the metrics manager correctly shifts records for multistep
  prediction cases
  """
    print "*Testing Multistep temporal shift*..."
    from nupic.data.fieldmeta import (FieldMetaInfo, FieldMetaType,
                                      FieldMetaSpecial)

    from nupic.frameworks.opf.metrics import MetricSpec
    from nupic.frameworks.opf.opf_utils import ModelResult, SensorInput
    onlineMetrics = ()

    modelFieldMetaInfo = (FieldMetaInfo(name='consumption',
                                        type=FieldMetaType.float,
                                        special=FieldMetaSpecial.none), )

    mgr = MetricsManager(metricSpecs=onlineMetrics,
                         fieldInfo=modelFieldMetaInfo,
                         inferenceType=InferenceType.TemporalMultiStep)

    groundTruths = [{'consumption': i} for i in range(10)]
    oneStepInfs = reversed(range(10))
    threeStepInfs = range(5, 15)

    for iterNum, gt, os, ts in zip(xrange(10), groundTruths, oneStepInfs,
                                   threeStepInfs):
        inferences = {InferenceElement.multiStepPredictions: {1: os, 3: ts}}
        sensorInput = SensorInput(dataDict=[gt])
        result = ModelResult(sensorInput=sensorInput, inferences=inferences)
        mgr.update(result)

        assert mgr._getGroundTruth(
            InferenceElement.multiStepPredictions)[0] == gt
        if iterNum < 1:
            #assert mgr._getInference(InferenceElement.multiStepPredictions) is None
            assert mgr._getInference(
                InferenceElement.multiStepPredictions)[1] is None
        else:
            prediction = mgr._getInference(
                InferenceElement.multiStepPredictions)[1]
            assert prediction == 10 - iterNum

        if iterNum < 3:
            inference = mgr._getInference(
                InferenceElement.multiStepPredictions)
            assert inference is None or inference[3] is None
        else:
            prediction = mgr._getInference(
                InferenceElement.multiStepPredictions)[3]
            assert prediction == iterNum + 2
Beispiel #12
0
  def __init__(self, fn, iterations, historyBuffer = None):
    """Set up a record source driven by the callable fn.

    fn:            callable; fn(0) is invoked here and must return a dict-like
                   object mapping field name to value
    iterations:    number of iterations; stored and also turned into an
                   iterator over xrange(iterations)
    historyBuffer: stored as self.history

    Note that this re-seeds the global random module with 42.
    """
    random.seed(42)

    self.nIterations = iterations
    self.fn = fn
    self.iterations = iter(xrange(iterations))
    self.history = historyBuffer

    # The record for index 0 defines the schema: every field becomes a
    # (name, type, special) triple with an empty special flag.
    initialRecord = fn(0)
    schema = []
    for key, value in initialRecord.iteritems():
      schema.append((key, self.__getType(value), ''))

    self.__metaData = FieldMetaInfo.createListFromFileFieldList(schema)
Beispiel #13
0
    def testEncoderWithSequenceAndResetFields(self):
        """Encode one record with a schema that has reset and sequence fields.

        The encoded '_reset' and '_sequenceId' entries are expected to mirror
        the record's 'reset' and 'sid' values.
        """
        schema = [
            ('name', FieldMetaType.string, FieldMetaSpecial.none),
            ('timestamp', FieldMetaType.datetime, FieldMetaSpecial.timestamp),
            ('integer', FieldMetaType.integer, FieldMetaSpecial.none),
            ('real', FieldMetaType.float, FieldMetaSpecial.none),
            ('reset', FieldMetaType.integer, FieldMetaSpecial.reset),
            ('sid', FieldMetaType.string, FieldMetaSpecial.sequence),
            ('categories', FieldMetaType.list, FieldMetaSpecial.category),
        ]
        encoder = ModelRecordEncoder(
            fields=[FieldMetaInfo(*spec) for spec in schema])

        record = [
            'rec_1',
            datetime(day=1, month=3, year=2010), 5, 6.5, 1, 99, [0, 1, 2]
        ]
        encoded = encoder.encode(record)

        expected = {
            'name': 'rec_1',
            'timestamp': datetime(2010, 3, 1, 0, 0),
            'integer': 5,
            'real': 6.5,
            'reset': 1,
            'sid': 99,
            'categories': [0, 1, 2],
            '_category': [0, 1, 2],
            '_reset': 1,
            '_sequenceId': 99,
            '_timestamp': datetime(2010, 3, 1, 0, 0),
            '_timestampRecordIdx': None
        }
        self.assertEqual(encoded, expected)
def _testMetricsMgr():
    print "*Testing Metrics Managers*..."
    from nupic.data.fieldmeta import (FieldMetaInfo, FieldMetaType,
                                      FieldMetaSpecial)

    from nupic.frameworks.opf.metrics import MetricSpec
    from nupic.frameworks.opf.opf_utils import ModelResult, SensorInput
    onlineMetrics = (MetricSpec(metric="aae", inferenceElement='', \
                                field="consumption", params={}),)

    print "TESTING METRICS MANAGER (BASIC PLUMBING TEST)..."

    modelFieldMetaInfo = (FieldMetaInfo(name='temperature',
                                        type=FieldMetaType.float,
                                        special=FieldMetaSpecial.none),
                          FieldMetaInfo(name='consumption',
                                        type=FieldMetaType.float,
                                        special=FieldMetaSpecial.none))

    # -----------------------------------------------------------------------
    # Test to make sure that invalid InferenceElements are caught
    try:
        MetricsManager(metricSpecs=onlineMetrics,
                       fieldInfo=modelFieldMetaInfo,
                       inferenceType=InferenceType.TemporalNextStep)
    except ValueError:
        print "Caught bad inference element: PASS"

    print
    onlineMetrics = (MetricSpec(metric="aae",
                                inferenceElement=InferenceElement.prediction,
                                field="consumption",
                                params={}), )

    temporalMetrics = MetricsManager(
        metricSpecs=onlineMetrics,
        fieldInfo=modelFieldMetaInfo,
        inferenceType=InferenceType.TemporalNextStep)

    inputs = [
        {
            'groundTruthRow': [9, 7],
            'predictionsDict': {
                InferenceType.TemporalNextStep: [12, 17]
            }
        },
        {
            'groundTruthRow': [12, 17],
            'predictionsDict': {
                InferenceType.TemporalNextStep: [14, 19]
            }
        },
        {
            'groundTruthRow': [14, 20],
            'predictionsDict': {
                InferenceType.TemporalNextStep: [16, 21]
            }
        },
        {
            'groundTruthRow': [9, 7],
            'predictionsDict': {
                InferenceType.TemporalNextStep: None
            }
        },
    ]

    for element in inputs:
        groundTruthRow = element['groundTruthRow']
        tPredictionRow = element['predictionsDict'][
            InferenceType.TemporalNextStep]

        result = ModelResult(sensorInput=SensorInput(dataRow=groundTruthRow,
                                                     dataEncodings=None,
                                                     sequenceReset=0,
                                                     category=None),
                             inferences={'prediction': tPredictionRow})

        temporalMetrics.update(result)

    assert temporalMetrics.getMetrics().values()[0] == 15.0 / 3.0, \
            "Expected %f, got %f" %(15.0/3.0,
                                    temporalMetrics.getMetrics().values()[0])
    print "ok"

    return
    def testModelSwapper(self):
        """Simple end-to-end test of the model swapper system.

        Starts the model scheduler subprocess, then through
        ModelSwapperInterface: defines a model twice, submits two input rows,
        consumes three result batches, deletes the model (a fourth batch) and
        deletes it once more. Afterwards each collected batch is checked and
        the scheduler subprocess is terminated and expected to exit with
        return code 0.
        """

        modelSchedulerSubprocess = self._startModelSchedulerSubprocess()
        # Safety net: kill the subprocess at cleanup time if it has not
        # exited by then (returncode is still None).
        self.addCleanup(lambda: modelSchedulerSubprocess.kill() if
                        modelSchedulerSubprocess.returncode is None else None)

        modelID = "foobar"
        resultBatches = []

        with ModelSwapperInterface() as swapperAPI:
            possibleModels = getScalarMetricWithTimeOfDayParams(metricData=[0],
                                                                minVal=0,
                                                                maxVal=1000)

            # Submit requests including a model creation command and two data rows.
            args = possibleModels[0]
            # Two-column schema: a timestamp column "c0" and a float column "c1"
            args["inputRecordSchema"] = (
                FieldMetaInfo("c0", FieldMetaType.datetime,
                              FieldMetaSpecial.timestamp),
                FieldMetaInfo("c1", FieldMetaType.float,
                              FieldMetaSpecial.none),
            )

            # Define the model
            _LOGGER.info("Defining the model")
            swapperAPI.defineModel(modelID=modelID,
                                   args=args,
                                   commandID="defineModelCmd1")

            # Attempt to define the same model again
            _LOGGER.info("Defining the model again")
            swapperAPI.defineModel(modelID=modelID,
                                   args=args,
                                   commandID="defineModelCmd2")

            # Send input rows to the model
            inputRows = [
                ModelInputRow(
                    rowID="rowfoo",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 00), 5.3]),
                ModelInputRow(
                    rowID="rowbar",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 15), 2.4]),
            ]
            _LOGGER.info("Submitting batch of %d input rows...",
                         len(inputRows))
            swapperAPI.submitRequests(modelID=modelID, requests=inputRows)

            _LOGGER.info("These models have pending input: %s",
                         swapperAPI.getModelsWithInputPending())

            # Retrieve all results.
            # NOTE: We collect results via background thread to avoid
            # deadlocking the test runner in the event consuming blocks unexpectedly
            _LOGGER.info("Reading all batches of results...")

            # Two defineModel results plus one batch for the input rows
            numBatchesExpected = 3
            resultBatches.extend(
                self._consumeResults(numBatchesExpected, timeout=20))

            self.assertEqual(len(resultBatches), numBatchesExpected)

            with MessageBusConnector() as bus:
                # The results message queue should be empty now
                self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

            # Delete the model
            _LOGGER.info("Deleting the model")
            swapperAPI.deleteModel(modelID=modelID,
                                   commandID="deleteModelCmd1")

            _LOGGER.info("Waiting for model deletion result")
            resultBatches.extend(self._consumeResults(1, timeout=20))

            self.assertEqual(len(resultBatches), 4)

            with MessageBusConnector() as bus:
                # The results message queue should be empty now
                self.assertTrue(bus.isEmpty(swapperAPI._resultsQueueName))

                # The model input queue should be deleted now
                self.assertFalse(
                    bus.isMessageQeueuePresent(
                        swapperAPI._getModelInputQName(modelID=modelID)))

            # Try deleting the model again, to make sure there are no exceptions
            # NOTE(review): this re-uses commandID "deleteModelCmd1" and its
            # result is never consumed -- confirm that is intentional.
            _LOGGER.info("Attempting to delete the model again")
            swapperAPI.deleteModel(modelID=modelID,
                                   commandID="deleteModelCmd1")

        # Verify results

        # First result batch should be the first defineModel result
        batch = resultBatches[0]
        self.assertEqual(batch.modelID, modelID)
        self.assertEqual(len(batch.objects), 1)

        result = batch.objects[0]
        self.assertIsInstance(result, ModelCommandResult)
        self.assertEqual(result.method, "defineModel")
        self.assertEqual(result.status, htmengineerrno.SUCCESS)
        self.assertEqual(result.commandID, "defineModelCmd1")

        # The second result batch should be for the second defineModel result
        # for the same model
        batch = resultBatches[1]
        self.assertEqual(batch.modelID, modelID)
        self.assertEqual(len(batch.objects), 1)

        result = batch.objects[0]
        self.assertIsInstance(result, ModelCommandResult)
        self.assertEqual(result.method, "defineModel")
        self.assertEqual(result.status, htmengineerrno.SUCCESS)
        self.assertEqual(result.commandID, "defineModelCmd2")

        # The third batch should be for the two input rows
        batch = resultBatches[2]
        self.assertEqual(batch.modelID, modelID)
        self.assertEqual(len(batch.objects), len(inputRows))

        # Results are matched to the submitted rows by position
        for inputRow, result in zip(inputRows, batch.objects):
            self.assertIsInstance(result, ModelInferenceResult)
            self.assertEqual(result.status, htmengineerrno.SUCCESS)
            self.assertEqual(result.rowID, inputRow.rowID)
            self.assertIsInstance(result.anomalyScore, float)

        # The fourth batch should be for the "deleteModel"
        batch = resultBatches[3]
        self.assertEqual(batch.modelID, modelID)
        self.assertEqual(len(batch.objects), 1)

        result = batch.objects[0]
        self.assertIsInstance(result, ModelCommandResult)
        self.assertEqual(result.method, "deleteModel")
        self.assertEqual(result.status, htmengineerrno.SUCCESS)
        self.assertEqual(result.commandID, "deleteModelCmd1")

        # Signal Model Scheduler Service subprocess to shut down and wait for it
        # Filled in by the waiter thread: "returnCode" on success,
        # "exceptionInfo" when waiting raised.
        waitResult = dict()

        def runWaiterThread():
            try:
                waitResult["returnCode"] = modelSchedulerSubprocess.wait()
            except:
                # Bare except is followed by a re-raise, so nothing is
                # swallowed; the traceback is recorded for the final assert's
                # failure message.
                _LOGGER.exception(
                    "Waiting for modelSchedulerSubprocess failed")
                waitResult["exceptionInfo"] = traceback.format_exc()
                raise
            return

        modelSchedulerSubprocess.terminate()
        # Wait in a daemon thread so that the join below can time out instead
        # of blocking the test on wait().
        waiterThread = threading.Thread(target=runWaiterThread)
        waiterThread.setDaemon(True)
        waiterThread.start()
        waiterThread.join(timeout=30)
        self.assertFalse(waiterThread.isAlive())

        self.assertEqual(waitResult["returnCode"], 0, msg=repr(waitResult))
Beispiel #16
0
 def testFieldMetaInfoRaisesValueErrorOnInvalidFieldType(self):
     """Constructing a FieldMetaInfo with an unknown type raises ValueError."""
     self.assertRaises(ValueError, FieldMetaInfo, "fieldName", "bogus-type",
                       FieldMetaSpecial.none)
Beispiel #17
0
    def testBasic(self):
        """Runs basic FileRecordStream tests."""
        filename = _getTempFileName()

        # Write a standard file
        fields = [
            FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
            FieldMetaInfo('timestamp', FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp),
            FieldMetaInfo('integer', FieldMetaType.integer,
                          FieldMetaSpecial.none),
            FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
            FieldMetaInfo('reset', FieldMetaType.integer,
                          FieldMetaSpecial.reset),
            FieldMetaInfo('sid', FieldMetaType.string,
                          FieldMetaSpecial.sequence),
            FieldMetaInfo('categoryField', FieldMetaType.integer,
                          FieldMetaSpecial.category),
        ]
        fieldNames = [
            'name', 'timestamp', 'integer', 'real', 'reset', 'sid',
            'categoryField'
        ]

        print 'Creating temp file:', filename

        with FileRecordStream(streamID=filename, write=True,
                              fields=fields) as s:

            self.assertEqual(0, s.getDataRowCount())

            # Records
            records = ([
                'rec_1',
                datetime(day=1, month=3, year=2010), 5, 6.5, 1, 'seq-1', 10
            ], [
                'rec_2',
                datetime(day=2, month=3, year=2010), 8, 7.5, 0, 'seq-1', 11
            ], [
                'rec_3',
                datetime(day=3, month=3, year=2010), 12, 8.5, 0, 'seq-1', 12
            ])

            self.assertEqual(fields, s.getFields())
            self.assertEqual(0, s.getNextRecordIdx())

            print 'Writing records ...'
            for r in records:
                print list(r)
                s.appendRecord(list(r))

            self.assertEqual(3, s.getDataRowCount())

            recordsBatch = ([
                'rec_4',
                datetime(day=4, month=3, year=2010), 2, 9.5, 1, 'seq-1', 13
            ], [
                'rec_5',
                datetime(day=5, month=3, year=2010), 6, 10.5, 0, 'seq-1', 14
            ], [
                'rec_6',
                datetime(day=6, month=3, year=2010), 11, 11.5, 0, 'seq-1', 15
            ])

            print 'Adding batch of records...'
            for rec in recordsBatch:
                print rec
            s.appendRecords(recordsBatch)
            self.assertEqual(6, s.getDataRowCount())

        with FileRecordStream(filename) as s:

            # Read the standard file
            self.assertEqual(6, s.getDataRowCount())
            self.assertEqual(fieldNames, s.getFieldNames())

            # Note! this is the number of records read so far
            self.assertEqual(0, s.getNextRecordIdx())

            readStats = s.getStats()
            print 'Got stats:', readStats
            expectedStats = {
                'max': [None, None, 12, 11.5, 1, None, 15],
                'min': [None, None, 2, 6.5, 0, None, 10]
            }
            self.assertEqual(expectedStats, readStats)

            readRecords = []
            print 'Reading records ...'
            while True:
                r = s.getNextRecord()
                print r
                if r is None:
                    break

                readRecords.append(r)

            allRecords = records + recordsBatch
            for r1, r2 in zip(allRecords, readRecords):
                self.assertEqual(r1, r2)
Beispiel #18
0
    def __init__(self,
                 streamID,
                 write=False,
                 fields=None,
                 missingValues=None,
                 bookmark=None,
                 includeMS=True,
                 firstRecord=None):
        """ Constructor

        streamID:
            CSV file name, input or output
        write:
            True or False, open for writing if True
        fields:
            a list of nupic.data.fieldmeta.FieldMetaInfo field descriptors
            (or plain 3-tuples), only applicable when write==True
        missingValues:
            what missing values should be replaced with? Defaults to ['']
        bookmark:
            a reference to the previous reader, if passed in, the records
            will be returned starting from the point where bookmark was
            requested. Either bookmark or firstRecord can be specified, not
            both. If bookmark is used, then firstRecord MUST be None.
        includeMS:
            If false, the microseconds portion is not included in the
            generated output file timestamp fields. This makes it compatible
            with reading in from Excel.
        firstRecord:
            0-based index of the first record to start reading from. Either
            bookmark or firstRecord can be specified, not both. If bookmark
            is used, then firstRecord MUST be None.

        Each field is a 3-tuple (name, type, special or '')

        The name is the name of the field. The type is one of: 'string',
        'datetime', 'int', 'float', 'bool', 'sdr' ('address' is accepted and
        treated as 'string'). The special is either empty or one of S, R, T,
        C, L; the first four designate their field as the sequenceId, reset,
        timestamp, or category field, and L is tracked as the learning field.
        If the timestamp field (T) is present it must be of type datetime.
        The sequence id field must be either a string or an int. The reset,
        category and learning fields must be ints.

        Raises:
            RuntimeError if both bookmark and firstRecord are given.
            Exception if the header cannot be read, the header rows have
            different lengths, or a type or special flag is not recognized.
            AssertionError if a special field has the wrong type, or (when
            writing) fields is not a list/tuple of 3-element descriptors.
        """

        # Call superclass constructor
        super(FileRecordStream, self).__init__()

        # Only bookmark or firstRecord can be specified, not both
        if bookmark is not None and firstRecord is not None:
            raise RuntimeError(
                "Only bookmark or firstRecord can be specified, not both")

        if fields is None:
            fields = []
        if missingValues is None:
            missingValues = ['']

        # We'll be operating on csvs with arbitrarily long fields
        size = 2**27
        csv.field_size_limit(size)

        self._filename = streamID
        self._write = write
        # We can't guarantee what system files are coming from; the open mode
        # comes from the class-level read/write mode constants.
        self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
        self._file = open(self._filename, self._mode)
        self._sequences = set()
        self.rewindAtEOF = False

        if write:
            assert fields is not None
            assert isinstance(fields, (tuple, list))
            # Verify all fields are 3-tuple
            assert all(
                isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
                for f in fields)
            names, types, specials = zip(*fields)
            # zip() hands back tuples, but the 'address' remapping below
            # assigns into types, so it has to be a list (as it already is in
            # read mode).
            types = list(types)
            self._writer = csv.writer(self._file)
        else:
            os.linesep = '\n'  # make sure readline() works on windows too.
            # Read header lines
            self._reader = csv.reader(self._file,
                                      dialect='excel',
                                      quoting=csv.QUOTE_NONE)
            try:
                names = [n.strip() for n in self._reader.next()]
            except Exception:
                # Narrowed from a bare except so that KeyboardInterrupt and
                # SystemExit are no longer converted into this error.
                raise Exception('The header line of the file %s contained a NULL byte' \
                                % self._filename)
            types = [t.strip() for t in self._reader.next()]
            specials = [s.strip() for s in self._reader.next()]

            # If there are no specials, this means there was a blank line
            if len(specials) == 0:
                specials = [""]

        if not (len(names) == len(types) == len(specials)):
            raise Exception('Invalid file format: different number of fields '
                            'in the header rows of file %s (%d, %d, %d)' %
                            (streamID, len(names), len(types), len(specials)))

        # Verify standard file format
        allowedTypes = ('string', 'datetime', 'int', 'float', 'bool', 'sdr')
        for i, t in enumerate(types):
            # This is a temporary hack for the Precog milestone, which passes in a
            # type 'address' for address fields. Here we simply map the type "address"
            # to "string".
            if t == 'address':
                types[i] = 'string'
                t = 'string'

            if t not in allowedTypes:
                raise Exception(
                    'Invalid file format for "%s" - field type "%s" '
                    'not one of %s ' % (self._filename, t, allowedTypes))

        for s in specials:
            if s not in ('', 'T', 'R', 'S', 'C', 'L'):
                raise Exception(
                    'Invalid file format. \'%s\' is not a valid special '
                    'flag' % s)

        self._fields = [
            FieldMetaInfo(*attrs) for attrs in zip(names, types, specials)
        ]
        self._fieldCount = len(self._fields)

        # Keep track on how many records have been read/written
        self._recordCount = 0

        # Column index of each special field, or None when it is absent
        self._timeStampIdx = specials.index('T') if 'T' in specials else None
        self._resetIdx = specials.index('R') if 'R' in specials else None
        self._sequenceIdIdx = specials.index('S') if 'S' in specials else None
        self._categoryIdx = specials.index('C') if 'C' in specials else None
        self._learningIdx = specials.index('L') if 'L' in specials else None

        # keep track of the current sequence
        self._currSequence = None
        self._currTime = None

        # Compare against None rather than relying on truthiness: index 0 is
        # a valid column, and a truthiness test skipped the type check for a
        # special field in the first column.
        if self._timeStampIdx is not None:
            assert types[self._timeStampIdx] == 'datetime'
        if self._sequenceIdIdx is not None:
            assert types[self._sequenceIdIdx] in ('string', 'int')
        if self._resetIdx is not None:
            assert types[self._resetIdx] == 'int'
        if self._categoryIdx is not None:
            assert types[self._categoryIdx] == 'int'
        if self._learningIdx is not None:
            assert types[self._learningIdx] == 'int'

        # Convert the types to the actual types in order to convert the strings
        if self._mode == self._FILE_READ_MODE:
            # Reading: parse CSV strings into Python values
            m = dict(int=intOrNone,
                     float=floatOrNone,
                     bool=parseBool,
                     string=unescape,
                     datetime=parseTimestamp,
                     sdr=parseSdr)
        else:
            # Writing: serialize Python values into CSV strings
            if includeMS:
                datetimeFunc = serializeTimestamp
            else:
                datetimeFunc = serializeTimestampNoMS
            m = dict(int=str,
                     float=str,
                     string=escape,
                     bool=str,
                     datetime=datetimeFunc,
                     sdr=serializeSdr)

        # One converter per column, in column order
        self._adapters = [m[t] for t in types]

        self._missingValues = missingValues

        #
        # If the bookmark is set, we need to skip over first N records
        #
        if bookmark is not None:
            rowsToSkip = self._getStartRow(bookmark)
        elif firstRecord is not None:
            rowsToSkip = firstRecord
        else:
            rowsToSkip = 0

        while rowsToSkip > 0:
            self.next()
            rowsToSkip -= 1

        # Dictionary to store record statistics (min and max of scalars for now)
        self._stats = None
Beispiel #19
0
    def __init__(self,
                 streamDef,
                 bookmark=None,
                 saveOutput=False,
                 isBlocking=True,
                 maxTimeout=0,
                 eofOnTimeout=False):
        """Base class constructor, performs common initialization.

        Parameters:
        ----------------------------------------------------------------
        streamDef:  The stream definition, potentially containing multiple
                    sources (not supported yet). See
                    /nupic/frameworks/opf/jsonschema/stream_def.json for the
                    format of this dict.

        bookmark:   Bookmark to start reading from. This overrides the
                    first_record field of the streamDef if provided.

        saveOutput: If true, save the output to a csv file in a temp
                    directory. The path to the generated file can be found
                    in the log output.

        isBlocking: Should read operation block *forever* if the next row of
                    data is not available, but the stream is not marked as
                    'completed' yet?

        maxTimeout: If isBlocking is False, max seconds to wait for more
                    data before timing out; ignored when isBlocking is True.

        eofOnTimeout: If True and we get a read timeout (isBlocking must be
                    False to get read timeouts), assume we've reached the
                    end of the input and produce the last aggregated record,
                    if one can be completed.

        Raises:
        ----------------------------------------------------------------
        AssertionError: if streamDef does not hold exactly one source
                    stream, a declared type fails FieldMetaType.isValid, the
                    source has no 'source' entry, or _openStream returns
                    None.
        RuntimeError: if a column named in the stream definition is not
                    present in the underlying record store.
        """

        # Call superclass constructor
        super(StreamReader, self).__init__()

        loggerPrefix = 'com.numenta.nupic.data.StreamReader'
        self._logger = logging.getLogger(loggerPrefix)
        jsonhelpers.validate(streamDef,
                             schemaPath=pkg_resources.resource_filename(
                                 jsonschema.__name__, "stream_def.json"))
        assert len(
            streamDef['streams']) == 1, "Only 1 source stream is supported"

        # Save constructor args
        sourceDict = streamDef['streams'][0]
        self._recordCount = 0
        self._eofOnTimeout = eofOnTimeout
        self._logger.debug('Reading stream with the def: %s', sourceDict)

        # Dictionary to store record statistics (min and max of scalars for now)
        self._stats = None

        # ---------------------------------------------------------------------
        # Get the stream definition params

        # Limiting window of the stream. It would not return any records until
        # 'first_record' ID is read (or very first with the ID above that). The
        # stream will return EOS once it reads record with ID 'last_record' or
        # above (NOTE: the name 'lastRecord' is misleading because it is NOT
        #  inclusive).
        firstRecordIdx = sourceDict.get('first_record', None)
        self._sourceLastRecordIdx = sourceDict.get('last_record', None)

        # If a bookmark was given, then override first_record from the stream
        #  definition.
        if bookmark is not None:
            firstRecordIdx = None

        # Column names must be provided in the streamdef json
        # Special case is ['*'], meaning all available names from the record
        # stream; in that case no per-field filtering is needed.
        self._streamFieldNames = sourceDict.get('columns', None)
        wantsAllColumns = (self._streamFieldNames is not None
                           and self._streamFieldNames[0] == '*')
        self._needFieldsFiltering = not wantsAllColumns

        # Types must be specified in streamdef json, or in case of the
        #  file_record_stream types could be implicit from the file
        streamFieldTypes = sourceDict.get('types', None)
        self._logger.debug('Types from the def: %s', streamFieldTypes)
        # Validate that all types are valid
        if streamFieldTypes is not None:
            for dataType in streamFieldTypes:
                assert FieldMetaType.isValid(dataType)

        # Reset, sequence and time fields might be provided by streamdef json.
        # These are read once here and reused for both the field definitions
        # and the aggregator below.
        streamResetFieldName = streamDef.get('resetField', None)
        streamTimeFieldName = streamDef.get('timeField', None)
        streamSequenceFieldName = streamDef.get('sequenceIdField', None)
        self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                           streamTimeFieldName, streamSequenceFieldName)

        # =======================================================================
        # Open up the underlying record store
        dataUrl = sourceDict.get('source', None)
        assert dataUrl is not None
        self._recordStore = self._openStream(dataUrl, isBlocking, maxTimeout,
                                             bookmark, firstRecordIdx)
        assert self._recordStore is not None

        # =======================================================================
        # Prepare the data structures we need for returning just the fields
        #  the caller wants from each record
        recordStoreFields = self._recordStore.getFields()
        self._recordStoreFieldNames = self._recordStore.getFieldNames()

        if not self._needFieldsFiltering:
            self._streamFieldNames = self._recordStoreFieldNames

        # Build up the field definitions for each field. This is a list of
        #  FieldMetaInfo(name, type, special) entries.
        self._streamFields = []
        for dstIdx, name in enumerate(self._streamFieldNames):
            if name not in self._recordStoreFieldNames:
                raise RuntimeError(
                    "The column '%s' from the stream definition "
                    "is not present in the underlying stream which has the following "
                    "columns: %s" % (name, self._recordStoreFieldNames))

            fieldIdx = self._recordStoreFieldNames.index(name)
            fieldType = recordStoreFields[fieldIdx].type
            fieldSpecial = recordStoreFields[fieldIdx].special

            # If the types or specials were defined in the stream definition,
            #   then override what was found in the record store
            if streamFieldTypes is not None:
                fieldType = streamFieldTypes[dstIdx]

            # Later checks win: if the same name is configured for several
            # roles, sequence overrides timestamp, which overrides reset.
            if streamResetFieldName is not None and streamResetFieldName == name:
                fieldSpecial = FieldMetaSpecial.reset
            if streamTimeFieldName is not None and streamTimeFieldName == name:
                fieldSpecial = FieldMetaSpecial.timestamp
            if (streamSequenceFieldName is not None
                    and streamSequenceFieldName == name):
                fieldSpecial = FieldMetaSpecial.sequence

            self._streamFields.append(
                FieldMetaInfo(name, fieldType, fieldSpecial))

        # ========================================================================
        # Create the aggregator which will handle aggregation of records before
        #  returning them.
        aggregationInfo = streamDef.get('aggregation', None)
        self._aggregator = Aggregator(
            aggregationInfo=aggregationInfo,
            inputFields=recordStoreFields,
            timeFieldName=streamTimeFieldName,
            sequenceIdFieldName=streamSequenceFieldName,
            resetFieldName=streamResetFieldName)

        # We rely on the aggregator to tell us the bookmark of the last raw input
        #  that contributed to the aggregated record
        self._aggBookmark = None

        # Compute the aggregation period in terms of months and seconds
        if 'aggregation' in streamDef:
            self._aggMonthsAndSeconds = nupic.support.aggregationToMonthsSeconds(
                aggregationInfo)
        else:
            self._aggMonthsAndSeconds = None

        # ========================================================================
        # Are we saving the generated output to a csv?
        if saveOutput:
            # The temp directory is intentionally left in place; its path is
            # only reported through the log message below.
            tmpDir = tempfile.mkdtemp()
            outFilename = os.path.join(tmpDir, "generated_output.csv")
            self._logger.info(
                "StreamReader: Saving generated records to: '%s'",
                outFilename)
            self._writer = FileRecordStream(streamID=outFilename,
                                            write=True,
                                            fields=self._streamFields)
        else:
            self._writer = None
    def testCloneModel(self):
        """Define a model, clone it, feed the clone, then delete the clone.

        The test starts the model scheduler subprocess, issues the commands
        through ModelSwapperInterface and expects exactly four result batches
        in this order: defineModel, cloneModel, inference results for the two
        input rows, deleteModel. Finally the scheduler subprocess is
        terminated and must exit with return code 0 within 30 seconds.
        """
        schedulerProc = self._startModelSchedulerSubprocess()

        def killSchedulerIfStillRunning():
            # Only kill when no return code has been recorded yet
            if schedulerProc.returncode is None:
                schedulerProc.kill()

        self.addCleanup(killSchedulerIfStillRunning)

        sourceModelID = "abc"
        cloneModelID = "def"

        collectedBatches = []

        def verifyCommandBatch(batch, expectedModelID, method, commandID):
            # A command batch carries a single successful ModelCommandResult
            self.assertEqual(batch.modelID, expectedModelID)
            self.assertEqual(len(batch.objects), 1)

            outcome = batch.objects[0]
            self.assertIsInstance(outcome, ModelCommandResult)
            self.assertEqual(outcome.method, method)
            self.assertEqual(outcome.status, htmengineerrno.SUCCESS)
            self.assertEqual(outcome.commandID, commandID)

        with ModelSwapperInterface() as swapper:
            modelArgs = getScalarMetricWithTimeOfDayAnomalyParams(
                metricData=[0], minVal=0, maxVal=1000)

            # Schema of the rows submitted below: a timestamp and a scalar
            timestampField = FieldMetaInfo("c0", FieldMetaType.datetime,
                                           FieldMetaSpecial.timestamp)
            valueField = FieldMetaInfo("c1", FieldMetaType.float,
                                       FieldMetaSpecial.none)
            modelArgs["inputRecordSchema"] = (timestampField, valueField)

            # Step 1: define the source model
            _LOGGER.info("Defining the model")
            swapper.defineModel(modelID=sourceModelID,
                                args=modelArgs,
                                commandID="defineModelCmd1")

            collectedBatches.extend(self._consumeResults(1, timeout=20))
            self.assertEqual(len(collectedBatches), 1)

            # Step 2: clone the just-defined model
            _LOGGER.info("Cloning model")
            swapper.cloneModel(sourceModelID,
                               cloneModelID,
                               commandID="cloneModelCmd1")

            collectedBatches.extend(self._consumeResults(1, timeout=20))
            self.assertEqual(len(collectedBatches), 2)

            # Step 3: send input rows to the clone
            inputRows = [
                ModelInputRow(
                    rowID="rowfoo",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 0), 5.3]),
                ModelInputRow(
                    rowID="rowbar",
                    data=[datetime.datetime(2013, 5, 23, 8, 13, 15), 2.4]),
            ]
            _LOGGER.info("Submitting batch of %d input rows...",
                         len(inputRows))
            swapper.submitRequests(modelID=cloneModelID, requests=inputRows)

            _LOGGER.info("These models have pending input: %s",
                         swapper.getModelsWithInputPending())

            collectedBatches.extend(self._consumeResults(1, timeout=20))
            self.assertEqual(len(collectedBatches), 3)

            with MessageBusConnector() as bus:
                # Everything produced so far has been consumed
                self.assertTrue(bus.isEmpty(swapper._resultsQueueName))

            # Step 4: delete the clone
            _LOGGER.info("Deleting the model")
            swapper.deleteModel(modelID=cloneModelID,
                                commandID="deleteModelCmd1")

            _LOGGER.info("Waiting for model deletion result")
            collectedBatches.extend(self._consumeResults(1, timeout=20))

            self.assertEqual(len(collectedBatches), 4)

            with MessageBusConnector() as bus:
                # The results message queue should be empty again
                self.assertTrue(bus.isEmpty(swapper._resultsQueueName))

                # The clone's input queue should no longer exist
                cloneInputQueue = swapper._getModelInputQName(
                    modelID=cloneModelID)
                self.assertFalse(bus.isMessageQeueuePresent(cloneInputQueue))

        # Verify the collected results, batch by batch
        defineBatch, cloneBatch, inferenceBatch, deleteBatch = (
            collectedBatches[0], collectedBatches[1], collectedBatches[2],
            collectedBatches[3])

        # Batch 1: the defineModel result, reported for the source model
        verifyCommandBatch(defineBatch, sourceModelID, "defineModel",
                           "defineModelCmd1")

        # Batch 2: the cloneModel result, also reported for the source model
        verifyCommandBatch(cloneBatch, sourceModelID, "cloneModel",
                           "cloneModelCmd1")

        # Batch 3: one inference result per input row, reported for the clone
        self.assertEqual(inferenceBatch.modelID, cloneModelID)
        self.assertEqual(len(inferenceBatch.objects), len(inputRows))

        for submittedRow, inference in zip(inputRows, inferenceBatch.objects):
            self.assertIsInstance(inference, ModelInferenceResult)
            self.assertEqual(inference.status, htmengineerrno.SUCCESS)
            self.assertEqual(inference.rowID, submittedRow.rowID)
            self.assertIsInstance(inference.anomalyScore, float)
            self.assertIsInstance(inference.multiStepBestPredictions, dict)

        # Batch 4: the deleteModel result for the clone
        verifyCommandBatch(deleteBatch, cloneModelID, "deleteModel",
                           "deleteModelCmd1")

        # Signal Model Scheduler Service subprocess to shut down and wait for
        # it on a helper thread so that the wait can be bounded by a timeout
        waitOutcome = {}

        def waitForScheduler():
            try:
                waitOutcome["returnCode"] = schedulerProc.wait()
            except:
                _LOGGER.exception(
                    "Waiting for modelSchedulerSubprocess failed")
                waitOutcome["exceptionInfo"] = traceback.format_exc()
                raise

        schedulerProc.terminate()
        waiter = threading.Thread(target=waitForScheduler)
        waiter.setDaemon(True)
        waiter.start()
        waiter.join(timeout=30)
        self.assertFalse(waiter.isAlive())

        self.assertEqual(waitOutcome["returnCode"], 0, msg=repr(waitOutcome))
    def _auxTestRunModelWithFullThenIncrementalCheckpoints(
            self, classifierEnabled):
        """Run a model through a full checkpoint followed by incremental
        checkpoints, then delete it.

        Sequence exercised against model "foobar":
          1. defineModel plus a first batch of two rows; the checkpoint must
             then list two batch ids and carry no "input samples since
             checkpoint" attribute.
          2. a second batch of two rows; the checkpoint must list only that
             batch id and hold that batch's row data as samples.
          3. a third batch of two rows; the checkpoint must list only that
             batch id and hold the row data of batches two and three.
          4. deleteModel; afterwards the input queue and every checkpoint
             artifact must be gone.

        classifierEnabled: stored as the model params' "clEnable" entry. When
            true each inference result must carry a dict in
            multiStepBestPredictions, otherwise None.
        """
        modelID = "foobar"
        checkpointMgr = model_checkpoint_mgr.ModelCheckpointMgr()

        modelArgs = getScalarMetricWithTimeOfDayAnomalyParams(
            metricData=[0], minVal=0, maxVal=1000)

        modelArgs["modelConfig"]["modelParams"]["clEnable"] = classifierEnabled

        # Schema of the submitted rows: a timestamp and a scalar value
        modelArgs["inputRecordSchema"] = (
            FieldMetaInfo("c0", FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp),
            FieldMetaInfo("c1", FieldMetaType.float, FieldMetaSpecial.none),
        )

        def runModelRunner(numExpectedBatches):
            # Run model_runner for this model and collect its result batches
            with self._startModelRunnerSubprocess(modelID) as runnerProc:
                batches = self._consumeResults(
                    numExpectedBatches=numExpectedBatches, timeout=15)
                self._waitForProcessToStopAndCheck(runnerProc)
            return batches

        def logSubmission(rows):
            _LOGGER.info(
                "Submitting batch of %d input rows with ids=[%s..%s]...",
                len(rows), rows[0].rowID, rows[-1].rowID)

        def verifyCommandBatch(batch, method, commandID):
            # A command batch carries a single successful ModelCommandResult
            self.assertEqual(batch.modelID, modelID)
            self.assertEqual(len(batch.objects), 1)
            outcome = batch.objects[0]
            self.assertIsInstance(outcome, ModelCommandResult)
            self.assertEqual(outcome.method, method)
            self.assertEqual(outcome.status, htmengineerrno.SUCCESS)
            self.assertEqual(outcome.commandID, commandID)

        def verifyInferenceBatch(batch, rows):
            # One successful inference result per submitted row, in order
            self.assertEqual(batch.modelID, modelID)
            self.assertEqual(len(batch.objects), len(rows))
            for submittedRow, inference in zip(rows, batch.objects):
                self.assertIsInstance(inference, ModelInferenceResult)
                self.assertEqual(inference.status, htmengineerrno.SUCCESS)
                self.assertEqual(inference.rowID, submittedRow.rowID)
                self.assertIsInstance(inference.anomalyScore, float)
                if classifierEnabled:
                    self.assertIsInstance(
                        inference.multiStepBestPredictions, dict)
                else:
                    self.assertIsNone(inference.multiStepBestPredictions)

        def loadCheckpointAttrs():
            # Loading the model must succeed; the loaded model is discarded
            checkpointMgr.load(modelID)
            attrs = checkpointMgr.loadCheckpointAttributes(modelID)
            self.assertIn(
                model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME,
                attrs,
                msg=repr(attrs))
            return attrs

        def verifyIncrementalCheckpoint(expectedBatchID, expectedRows):
            archiver = model_runner._ModelArchiver
            attrs = loadCheckpointAttrs()
            self.assertSequenceEqual(
                attrs[archiver._BATCH_IDS_CHECKPOINT_ATTR_NAME],
                [expectedBatchID],
                msg=repr(attrs))
            self.assertIn(archiver._INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME,
                          attrs,
                          msg=repr(attrs))
            self.assertSequenceEqual(
                archiver._decodeDataSamples(
                    attrs[archiver._INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME]),
                [row.data for row in expectedRows],
                msg=repr(attrs))

        with ModelSwapperInterface() as swapper:

            def assertResultsQueueEmpty():
                with MessageBusConnector() as bus:
                    self.assertTrue(bus.isEmpty(swapper._resultsQueueName))

            def runIncrementalBatch(rows):
                # Submit rows, run the model runner once and check the single
                # inference batch it produces; returns the submitted batch id
                logSubmission(rows)
                batchID = swapper.submitRequests(modelID=modelID,
                                                 requests=rows)
                batches = runModelRunner(1)
                assertResultsQueueEmpty()
                verifyInferenceBatch(batches[0], rows)
                return batchID

            # ---- Phase 1: model definition plus first batch of rows
            _LOGGER.info("Defining the model")
            swapper.defineModel(modelID=modelID,
                                args=modelArgs,
                                commandID="defineModelCmd1")
            inputRows = [
                ModelInputRow(
                    rowID="rowfoo",
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 0), 5.3]),
                ModelInputRow(
                    rowID="rowbar",
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 15), 2.4]),
            ]
            logSubmission(inputRows)
            swapper.submitRequests(modelID=modelID, requests=inputRows)

            resultBatches = runModelRunner(2)
            assertResultsQueueEmpty()
            self.assertEqual(len(resultBatches), 2, repr(resultBatches))

            # First batch: defineModel result; second: the two input rows
            verifyCommandBatch(resultBatches[0], "defineModel",
                               "defineModelCmd1")
            verifyInferenceBatch(resultBatches[1], inputRows)

            # Verify model checkpoint: two batch ids, no pending samples
            attrs = loadCheckpointAttrs()
            self.assertEqual(len(attrs[
                model_runner._ModelArchiver._BATCH_IDS_CHECKPOINT_ATTR_NAME]),
                             2,
                             msg=repr(attrs))
            self.assertNotIn(model_runner._ModelArchiver.
                             _INPUT_SAMPLES_SINCE_CHECKPOINT_ATTR_NAME,
                             attrs,
                             msg=repr(attrs))

            # ---- Phase 2: first incremental checkpoint
            inputRows2 = [
                ModelInputRow(
                    rowID=2,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 20), 2.7]),
                ModelInputRow(
                    rowID=3,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 25), 3.9]),
            ]
            inputBatchID = runIncrementalBatch(inputRows2)
            verifyIncrementalCheckpoint(inputBatchID, inputRows2)

            # ---- Phase 3: second incremental checkpoint; the stored samples
            # now span the rows of both incremental batches
            inputRows3 = [
                ModelInputRow(
                    rowID=4,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 30), 4.7]),
                ModelInputRow(
                    rowID=5,
                    data=[datetime.datetime(2014, 5, 23, 8, 13, 35), 5.9]),
            ]
            inputBatchID = runIncrementalBatch(inputRows3)
            verifyIncrementalCheckpoint(inputBatchID, inputRows2 + inputRows3)

            # ---- Phase 4: delete the model
            _LOGGER.info("Deleting the model=%s", modelID)
            swapper.deleteModel(modelID=modelID,
                                commandID="deleteModelCmd1")
            resultBatches = runModelRunner(1)
            self.assertEqual(len(resultBatches), 1, repr(resultBatches))

            # The only batch should be the deleteModel result
            verifyCommandBatch(resultBatches[0], "deleteModel",
                               "deleteModelCmd1")

            with MessageBusConnector() as bus:
                self.assertTrue(bus.isEmpty(swapper._resultsQueueName))

                # The model input queue should be deleted now
                modelInputQueue = swapper._getModelInputQName(modelID=modelID)
                self.assertFalse(bus.isMessageQeueuePresent(modelInputQueue))

            # The model checkpoint should be gone too
            for checkpointOp in (checkpointMgr.load,
                                 checkpointMgr.loadModelDefinition,
                                 checkpointMgr.loadCheckpointAttributes,
                                 checkpointMgr.remove):
                with self.assertRaises(model_checkpoint_mgr.ModelNotFound):
                    checkpointOp(modelID)
  def __init__(self, streamID, write=False, fields=None, missingValues=None,
               bookmark=None, includeMS=True, firstRecord=None):
    """Open a CSV-backed record stream for reading or for writing.

    Parameters:
    ----------------------------------------------------------------
    streamID:       Path of the file to open.

    write:          If true the file is opened in write mode and `fields`
                    supplies the header definition; otherwise the file is
                    opened for reading and the three header rows (names,
                    types, specials) are parsed from it.

    fields:         Write mode only: list/tuple of 3-element entries
                    (name, type, special), either tuples or FieldMetaInfo.

    missingValues:  Stored on the instance as the strings to treat as
                    missing; defaults to [''].

    bookmark:       If given, self._getStartRow(bookmark) decides how many
                    records are skipped after the header is processed.

    includeMS:      Write mode only: choose serializeTimestamp (true) or
                    serializeTimestampNoMS (false) for datetime fields.

    firstRecord:    If given (and no bookmark), number of records to skip
                    after the header is processed.

    Raises:
    ----------------------------------------------------------------
    RuntimeError:   if both bookmark and firstRecord are specified.
    Exception:      if the first header row cannot be read, the header rows
                    differ in length, or a type/special flag is invalid.
    AssertionError: if `fields` is malformed in write mode, or a special
                    field is declared with an unsupported type.
    """
    super(FileRecordStream, self).__init__()

    # Only bookmark or firstRow can be specified, not both
    if bookmark is not None and firstRecord is not None:
      raise RuntimeError(
          "Only bookmark or firstRecord can be specified, not both")

    if fields is None:
      fields = []
    if missingValues is None:
      missingValues = ['']

    # We'll be operating on csvs with arbitrarily long fields
    size = 2**27
    csv.field_size_limit(size)

    self._filename = streamID
    # We can't guarantee what system files are coming from, use universal
    # newlines
    self._write = write
    self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
    self._file = open(self._filename, self._mode)
    self._sequences = set()
    self.rewindAtEOF = False

    if write:
      assert fields is not None
      assert isinstance(fields, (tuple, list))
      # Verify all fields are 3-tuple
      assert all(isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
                 for f in fields)
      names, types, specials = zip(*fields)
      self._writer = csv.writer(self._file)
    else:
      # Read header lines
      self._reader = csv.reader(self._file, dialect="excel")
      try:
        names = [n.strip() for n in self._reader.next()]
      except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt/SystemExit
        # are no longer swallowed and re-labelled as a file format problem.
        raise Exception('The header line of the file %s contained a NULL byte' \
                        % self._filename)
      types = [t.strip() for t in self._reader.next()]
      specials = [s.strip() for s in self._reader.next()]

      # If there are no specials, this means there was a blank line
      if len(specials) == 0:
        specials=[""]

    if not len(names) == len(types) == len(specials):
      raise Exception('Invalid file format: different number of fields '
                      'in the header rows of file %s (%d, %d, %d)' %
                      (streamID, len(names), len(types), len(specials)))

    # Verify standard file format
    for t in types:
      if not FieldMetaType.isValid(t):
        raise Exception('Invalid file format for "%s" - field type "%s" '
                        'not a valid FieldMetaType' % (self._filename, t,))

    for s in specials:
      if not FieldMetaSpecial.isValid(s):
        raise Exception('Invalid file format. \'%s\' is not a valid special '
                        'flag' % s)

    self._fields = [FieldMetaInfo(*attrs)
                    for attrs in zip(names, types, specials)]
    self._fieldCount = len(self._fields)

    # Keep track on how many records have been read/written
    self._recordCount = 0

    # Column index of each special field, or None when the flag is absent
    self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp)
                          if FieldMetaSpecial.timestamp in specials else None)
    self._resetIdx = (specials.index(FieldMetaSpecial.reset)
                      if FieldMetaSpecial.reset in specials else None)
    self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence)
                           if FieldMetaSpecial.sequence in specials else None)
    self._categoryIdx = (specials.index(FieldMetaSpecial.category)
                         if FieldMetaSpecial.category in specials else None)
    self._learningIdx = (specials.index(FieldMetaSpecial.learning)
                         if FieldMetaSpecial.learning in specials else None)

    # keep track of the current sequence
    self._currSequence = None
    self._currTime = None

    # Validate the declared type of every special field. The indices are
    # compared against None explicitly: a plain truthiness test would skip
    # the check whenever the special field is the first column (index 0).
    if self._timeStampIdx is not None:
      assert types[self._timeStampIdx] == FieldMetaType.datetime
    if self._sequenceIdIdx is not None:
      assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                            FieldMetaType.integer)
    if self._resetIdx is not None:
      assert types[self._resetIdx] == FieldMetaType.integer
    if self._categoryIdx is not None:
      assert types[self._categoryIdx] in (FieldMetaType.list,
                                          FieldMetaType.integer)
    if self._learningIdx is not None:
      assert types[self._learningIdx] == FieldMetaType.integer

    # Convert the types to the actual types in order to convert the strings
    if self._mode == self._FILE_READ_MODE:
      # Reading: one parser per field type
      m = {FieldMetaType.integer: intOrNone,
           FieldMetaType.float: floatOrNone,
           FieldMetaType.boolean: parseBool,
           FieldMetaType.string: unescape,
           FieldMetaType.datetime: parseTimestamp,
           FieldMetaType.sdr: parseSdr,
           FieldMetaType.list: parseStringList}
    else:
      # Writing: one serializer per field type
      if includeMS:
        datetimeFunc = serializeTimestamp
      else:
        datetimeFunc = serializeTimestampNoMS
      m = {FieldMetaType.integer: str,
           FieldMetaType.float: str,
           FieldMetaType.string: escape,
           FieldMetaType.boolean: str,
           FieldMetaType.datetime: datetimeFunc,
           FieldMetaType.sdr: serializeSdr,
           FieldMetaType.list: stripList}

    # One adapter per column, in column order
    self._adapters = [m[t] for t in types]

    self._missingValues = missingValues

    #
    # If the bookmark is set, we need to skip over first N records
    #
    if bookmark is not None:
      rowsToSkip = self._getStartRow(bookmark)
    elif firstRecord is not None:
      rowsToSkip = firstRecord
    else:
      rowsToSkip = 0

    while rowsToSkip > 0:
      self.next()
      rowsToSkip -= 1


    # Dictionary to store record statistics (min and max of scalars for now)
    self._stats = None
Beispiel #23
0
    def testEncoderWithSequenceFieldWithoutResetField(self):
        """Encode records whose schema has a sequence field but no reset field.

        Three records are pushed through one ModelRecordEncoder and the full
        encoded dict is compared each time; the expectation is that `_reset`
        is 1 on the first record and whenever the sequence id changes, and 0
        while the sequence id repeats.
        """
        schema = [
            FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
            FieldMetaInfo('timestamp', FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp),
            FieldMetaInfo('integer', FieldMetaType.integer,
                          FieldMetaSpecial.none),
            FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
            FieldMetaInfo('sid', FieldMetaType.string,
                          FieldMetaSpecial.sequence),
            FieldMetaInfo('categories', FieldMetaType.list,
                          FieldMetaSpecial.category)
        ]

        encoder = ModelRecordEncoder(fields=schema)

        # Each case: (record name, day of March 2010, sequence id,
        #             expected value of _reset)
        cases = (
            # _reset should be 1 the first time
            ('rec_1', 1, 99, 1),
            # _reset should be 0 when same sequence id is repeated
            ('rec_2', 2, 99, 0),
            # _reset should be 1 when sequence id changes
            ('rec_3', 2, 100, 1),
        )

        for recName, day, seqId, expectedReset in cases:
            encoded = encoder.encode([
                recName,
                datetime(day=day, month=3, year=2010), 5, 6.5, seqId,
                [0, 1, 2]
            ])

            stamp = datetime(2010, 3, day, 0, 0)
            expected = {
                'name': recName,
                'timestamp': stamp,
                'integer': 5,
                'real': 6.5,
                'sid': seqId,
                'categories': [0, 1, 2],
                '_category': [0, 1, 2],
                '_reset': expectedReset,
                '_sequenceId': seqId,
                '_timestamp': stamp,
                '_timestampRecordIdx': None
            }
            self.assertEqual(encoded, expected)
Beispiel #24
0
    def testEncoderWithResetFieldWithoutSequenceField(self):
        """Encode records whose schema has a reset field but no sequence field.

        Four records are pushed through one ModelRecordEncoder and the full
        encoded dict is compared each time. The expected `_sequenceId` starts
        at 0, goes up by one on the next record carrying reset=1, stays put on
        reset=0, and is back at 0 after `rewind()`.
        """
        schema = [
            FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
            FieldMetaInfo('timestamp', FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp),
            FieldMetaInfo('integer', FieldMetaType.integer,
                          FieldMetaSpecial.none),
            FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
            FieldMetaInfo('reset', FieldMetaType.integer,
                          FieldMetaSpecial.reset),
            FieldMetaInfo('categories', FieldMetaType.list,
                          FieldMetaSpecial.category)
        ]

        encoder = ModelRecordEncoder(fields=schema)

        # Each case: (record name, day of March 2010, reset flag,
        #             expected _sequenceId, rewind the encoder first?)
        cases = (
            ('rec_1', 1, 1, 0, False),
            # One more time to verify incrementing sequence id
            ('rec_2', 2, 1, 1, False),
            # Now with reset turned off, expecting no change to sequence id
            ('rec_3', 3, 0, 1, False),
            # Now check that rewind resets sequence id
            ('rec_4', 4, 1, 0, True),
        )

        for recName, day, resetFlag, expectedSeqId, rewindFirst in cases:
            if rewindFirst:
                encoder.rewind()

            encoded = encoder.encode([
                recName,
                datetime(day=day, month=3, year=2010), 5, 6.5, resetFlag,
                [0, 1, 2]
            ])

            stamp = datetime(2010, 3, day, 0, 0)
            expected = {
                'name': recName,
                'timestamp': stamp,
                'integer': 5,
                'real': 6.5,
                'reset': resetFlag,
                'categories': [0, 1, 2],
                '_category': [0, 1, 2],
                '_reset': resetFlag,
                '_sequenceId': expectedSeqId,
                '_timestamp': stamp,
                '_timestampRecordIdx': None
            }
            self.assertEqual(encoded, expected)
Beispiel #25
0
    def __init__(self,
                 streamID,
                 write=False,
                 fields=None,
                 missingValues=None,
                 bookmark=None,
                 includeMS=True,
                 firstRecord=None):
        """Open a CSV-backed record stream for reading or for writing.

        In read mode the three header rows (names, types, specials) are parsed
        from the file; in write mode they are taken from `fields`. Either way
        the header is validated and per-column value adapters are installed.

        streamID:
            CSV file name, input or output
        write:
            True or False, open for writing if True
        fields:
            a list of nupic.data.fieldmeta.FieldMetaInfo field descriptors,
            only applicable when write==True
        missingValues:
            what missing values should be replaced with? Defaults to ['']
        bookmark:
            a reference to the previous reader, if passed in, the records will
            be returned starting from the point where bookmark was requested.
            Either bookmark or firstRecord can be specified, not both. If
            bookmark is used, then firstRecord MUST be None.
        includeMS:
            If false, the microseconds portion is not included in the
            generated output file timestamp fields. This makes it compatible
            with reading in from Excel.
        firstRecord:
            0-based index of the first record to start reading from. Either
            bookmark or firstRecord can be specified, not both. If bookmark is
            used, then firstRecord MUST be None.

        Raises RuntimeError when both bookmark and firstRecord are given,
        Exception when the header rows are unreadable, of unequal length, or
        contain an invalid type/special, and AssertionError when a special
        field has a type that is not allowed for it (or, in write mode, when
        `fields` is not a list/tuple of 3-element descriptors).

        Each field is a 3-tuple (name, type, special or FieldMetaSpecial.none)

        The name is the name of the field. The type is one of the constants in
        `FieldMetaType`. The special is one of the `FieldMetaSpecial` values
        that designate their field as the sequenceId, reset, timestamp, or
        category. With exception of multiple categories, there can be at most
        one of each. There may be multiple fields of type datetime, but no
        more than one of them may be the timestamp field
        (FieldMetaSpecial.timestamp). The sequence id field must be either a
        string or an int. The reset field must be an int (and must contain 0
        or 1).

        The category field must be an int or space-separated list of ints,
        where the former represents single-label classification and the latter
        is for multi-label classification (e.g. "1 3 4" designates a record
        for labels 1, 3, and 4). The number of categories is allowed to vary
        record to record; sensor regions represent non-categories with -1,
        thus the category values must be >= 0.

        The FileRecordStream iterates over the field names, types and specials
        and stores the information.
        """
        super(FileRecordStream, self).__init__()

        # Only bookmark or firstRecord can be specified, not both
        if bookmark is not None and firstRecord is not None:
            raise RuntimeError(
                "Only bookmark or firstRecord can be specified, not both")

        if fields is None:
            fields = []
        if missingValues is None:
            missingValues = ['']

        # We'll be operating on csvs with arbitrarily long fields
        size = 2**27
        csv.field_size_limit(size)

        self._filename = streamID
        self._write = write
        # The original note at this point says universal newlines are used
        # because we can't guarantee what system files are coming from; the
        # mode strings themselves are defined outside this method.
        self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
        self._file = open(self._filename, self._mode)
        self._sequences = set()
        self.rewindAtEOF = False

        if write:
            assert fields is not None
            assert isinstance(fields, (tuple, list))
            # Verify all fields are 3-tuple
            assert all(
                isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
                for f in fields)
            names, types, specials = zip(*fields)
            self._writer = csv.writer(self._file)
        else:
            # Read header lines
            self._reader = csv.reader(self._file, dialect="excel")
            try:
                names = [n.strip() for n in self._reader.next()]
            except Exception:
                # Any failure on the first row (including an empty file) is
                # reported with this message; KeyboardInterrupt/SystemExit are
                # no longer swallowed.
                raise Exception('The header line of the file %s contained a NULL byte' \
                                % self._filename)
            types = [t.strip() for t in self._reader.next()]
            specials = [s.strip() for s in self._reader.next()]

            # If there are no specials, this means there was a blank line
            if not specials:
                specials = [""]

        if not len(names) == len(types) == len(specials):
            raise Exception('Invalid file format: different number of fields '
                            'in the header rows of file %s (%d, %d, %d)' %
                            (streamID, len(names), len(types), len(specials)))

        # Verify standard file format
        for t in types:
            if not FieldMetaType.isValid(t):
                raise Exception(
                    'Invalid file format for "%s" - field type "%s" '
                    'not a valid FieldMetaType' % (
                        self._filename,
                        t,
                    ))

        for s in specials:
            if not FieldMetaSpecial.isValid(s):
                raise Exception(
                    'Invalid file format. \'%s\' is not a valid special '
                    'flag' % s)

        self._fields = [
            FieldMetaInfo(*attrs) for attrs in zip(names, types, specials)
        ]
        self._fieldCount = len(self._fields)

        # Keep track on how many records have been read/written
        self._recordCount = 0

        # Column index of each special field, or None when it is absent
        self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp) if
                              FieldMetaSpecial.timestamp in specials else None)
        self._resetIdx = (specials.index(FieldMetaSpecial.reset)
                          if FieldMetaSpecial.reset in specials else None)
        self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence) if
                               FieldMetaSpecial.sequence in specials else None)
        self._categoryIdx = (specials.index(FieldMetaSpecial.category) if
                             FieldMetaSpecial.category in specials else None)
        self._learningIdx = (specials.index(FieldMetaSpecial.learning) if
                             FieldMetaSpecial.learning in specials else None)

        # keep track of the current sequence
        self._currSequence = None
        self._currTime = None

        # Compare against None explicitly: index 0 is a valid column and is
        # falsy, so a plain truthiness test would skip validation for a
        # special field that happens to be the first column.
        if self._timeStampIdx is not None:
            assert types[self._timeStampIdx] == FieldMetaType.datetime
        if self._sequenceIdIdx is not None:
            assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                                  FieldMetaType.integer)
        if self._resetIdx is not None:
            assert types[self._resetIdx] == FieldMetaType.integer
        if self._categoryIdx is not None:
            assert types[self._categoryIdx] in (FieldMetaType.list,
                                                FieldMetaType.integer)
        if self._learningIdx is not None:
            assert types[self._learningIdx] == FieldMetaType.integer

        # Convert the types to the actual types in order to convert the strings
        if self._mode == self._FILE_READ_MODE:
            m = {
                FieldMetaType.integer: intOrNone,
                FieldMetaType.float: floatOrNone,
                FieldMetaType.boolean: parseBool,
                FieldMetaType.string: unescape,
                FieldMetaType.datetime: parseTimestamp,
                FieldMetaType.sdr: parseSdr,
                FieldMetaType.list: parseStringList
            }
        else:
            if includeMS:
                datetimeFunc = serializeTimestamp
            else:
                datetimeFunc = serializeTimestampNoMS
            m = {
                FieldMetaType.integer: str,
                FieldMetaType.float: str,
                FieldMetaType.string: escape,
                FieldMetaType.boolean: str,
                FieldMetaType.datetime: datetimeFunc,
                FieldMetaType.sdr: serializeSdr,
                FieldMetaType.list: stripList
            }

        # One adapter per column, in column order
        self._adapters = [m[t] for t in types]

        self._missingValues = missingValues

        #
        # If the bookmark is set, we need to skip over first N records
        #
        if bookmark is not None:
            rowsToSkip = self._getStartRow(bookmark)
        elif firstRecord is not None:
            rowsToSkip = firstRecord
        else:
            rowsToSkip = 0

        while rowsToSkip > 0:
            self.next()
            rowsToSkip -= 1

        # Dictionary to store record statistics (min and max of scalars for now)
        self._stats = None
Beispiel #26
0
 def testFieldMetaInfoRaisesValueErrorOnInvalidFieldSpecial(self):
     """Constructing a FieldMetaInfo with an unknown special must raise ValueError."""
     self.assertRaises(ValueError, FieldMetaInfo, "fieldName",
                       FieldMetaType.integer, "bogus-special")
Beispiel #27
0
    def __openDatafile(self, modelResult):
        """Open the data file and write the header row"""

        # Write reset bit
        resetFieldMeta = FieldMetaInfo(name="reset",
                                       type=FieldMetaType.integer,
                                       special=FieldMetaSpecial.reset)

        self.__outputFieldsMeta.append(resetFieldMeta)

        # -----------------------------------------------------------------------
        # Write each of the raw inputs that go into the encoders
        rawInput = modelResult.rawInput
        rawFields = rawInput.keys()
        rawFields.sort()
        for field in rawFields:
            if field.startswith('_') or field == 'reset':
                continue
            value = rawInput[field]
            meta = FieldMetaInfo(name=field,
                                 type=FieldMetaType.string,
                                 special=FieldMetaSpecial.none)
            self.__outputFieldsMeta.append(meta)
            self._rawInputNames.append(field)

        # -----------------------------------------------------------------------
        # Handle each of the inference elements
        for inferenceElement, value in modelResult.inferences.iteritems():
            inferenceLabel = InferenceElement.getLabel(inferenceElement)

            # TODO: Right now we assume list inferences are associated with
            # The input field metadata
            if type(value) in (list, tuple):
                # Append input and prediction field meta-info
                self.__outputFieldsMeta.extend(
                    self.__getListMetaInfo(inferenceElement))

            elif isinstance(value, dict):
                self.__outputFieldsMeta.extend(
                    self.__getDictMetaInfo(inferenceElement, value))
            else:

                if InferenceElement.getInputElement(inferenceElement):
                    self.__outputFieldsMeta.append(
                        FieldMetaInfo(name=inferenceLabel + ".actual",
                                      type=FieldMetaType.string,
                                      special=''))
                self.__outputFieldsMeta.append(
                    FieldMetaInfo(name=inferenceLabel,
                                  type=FieldMetaType.string,
                                  special=''))

        if self.__metricNames:
            for metricName in self.__metricNames:
                metricField = FieldMetaInfo(name=metricName,
                                            type=FieldMetaType.float,
                                            special=FieldMetaSpecial.none)

                self.__outputFieldsMeta.append(metricField)

        # Create the inference directory for our experiment
        inferenceDir = _FileUtils.createExperimentInferenceDir(
            self.__experimentDir)

        # Consctruct the prediction dataset file path
        filename = (self.__label + "." +
                    opfutils.InferenceType.getLabel(self.__inferenceType) +
                    ".predictionLog.csv")
        self.__datasetPath = os.path.join(inferenceDir, filename)

        # Create the output dataset
        print "OPENING OUTPUT FOR PREDICTION WRITER AT: %r" % self.__datasetPath
        print "Prediction field-meta: %r" % (
            [tuple(i) for i in self.__outputFieldsMeta], )
        self.__dataset = FileRecordStream(streamID=self.__datasetPath,
                                          write=True,
                                          fields=self.__outputFieldsMeta)

        # Copy data from checkpoint cache
        if self.__checkpointCache is not None:
            self.__checkpointCache.seek(0)

            reader = csv.reader(self.__checkpointCache, dialect='excel')

            # Skip header row
            try:
                header = reader.next()
            except StopIteration:
                print "Empty record checkpoint initializer for %r" % (
                    self.__datasetPath, )
            else:
                assert tuple(self.__dataset.getFieldNames()) == tuple(header), \
                  "dataset.getFieldNames(): %r; predictionCheckpointFieldNames: %r" % (
                  tuple(self.__dataset.getFieldNames()), tuple(header))

            # Copy the rows from checkpoint
            numRowsCopied = 0
            while True:
                try:
                    row = reader.next()
                except StopIteration:
                    break

                #print "DEBUG: restoring row from checkpoint: %r" % (row,)

                self.__dataset.appendRecord(row)
                numRowsCopied += 1

            self.__dataset.flush()

            print "Restored %d rows from checkpoint for %r" % (
                numRowsCopied, self.__datasetPath)

            # Dispose of our checkpoint cache
            self.__checkpointCache.close()
            self.__checkpointCache = None

        return
Beispiel #28
0
    def testMissingValues(self):

        print "Beginning Missing Data test..."
        filename = _getTempFileName()

        # Some values missing of each type
        # read dataset from disk, retrieve values
        # string should return empty string, numeric types sentinelValue

        print 'Creating tempfile:', filename

        # write dataset to disk with float, int, and string fields
        fields = [
            FieldMetaInfo('timestamp', FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp),
            FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
            FieldMetaInfo('integer', FieldMetaType.integer,
                          FieldMetaSpecial.none),
            FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none)
        ]
        s = FileRecordStream(streamID=filename, write=True, fields=fields)

        # Records
        records = ([datetime(day=1, month=3, year=2010), 'rec_1', 5, 6.5], [
            datetime(day=2, month=3, year=2010), '', 8, 7.5
        ], [datetime(day=3, month=3, year=2010), 'rec_3', '', 8.5], [
            datetime(day=4, month=3, year=2010), 'rec_4', 12, ''
        ], [datetime(day=5, month=3, year=2010), 'rec_5', -87657496599, 6.5], [
            datetime(day=6, month=3, year=2010), 'rec_6', 12, -87657496599
        ], [datetime(day=6, month=3, year=2010),
            str(-87657496599), 12, 6.5])

        for r in records:
            s.appendRecord(list(r))

        s.close()

        # Read the standard file
        s = FileRecordStream(streamID=filename, write=False)

        fieldsRead = s.getFields()
        self.assertEqual(fields, fieldsRead)

        recordsRead = []
        while True:
            r = s.getNextRecord()
            if r is None:
                break
            print 'Reading record ...'
            print r
            recordsRead.append(r)

        # sort the records by date, so we know for sure which is which
        sorted(recordsRead, key=lambda rec: rec[0])

        # empty string
        self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[1][1])

        # missing int
        self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[2][2])

        # missing float
        self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[3][3])

        # sentinel value in input handled correctly for int field
        self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[4][2])

        # sentinel value in input handled correctly for float field
        self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[5][3])

        # sentinel value in input handled correctly for string field
        # this should leave the string as-is, since a missing string
        # is encoded not with a sentinel value but with an empty string
        self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[6][1])
    def loadModel(self):
        """Load the model and construct the input row encoder.

        Does nothing when a model is already loaded. Otherwise the model is
        restored from its checkpoint or, when no checkpoint exists, created
        from the stored model definition. On success, the loaded model may be
        accessed via the `model` attribute.

        :raises: _ModelRunnerError with errno ERR_NO_SUCH_MODEL when the
            model definition cannot be found
        """
        if self._model is not None:
            return

        def fetchDefinition():
            # Load the stored model definition, translating "not found" into
            # the runner's own error type.
            try:
                return self._checkpointMgr.loadModelDefinition(self._modelID)
            except model_checkpoint_mgr.ModelNotFound:
                raise _ModelRunnerError(errno=htmengineerrno.ERR_NO_SUCH_MODEL,
                                        msg="modelID=%s not found" %
                                        (self._modelID))

        definition = None

        # Load the model
        try:
            self._model = self._checkpointMgr.load(self._modelID)
        except model_checkpoint_mgr.ModelNotFound:
            # So, we didn't have a checkpoint... try to create our model from
            # model definition params
            self._hasCheckpoint = False
            definition = fetchDefinition()
            params = definition["modelParams"]

            # TODO: when creating the model from params, do we need to call
            #   its model.setFieldStatistics() method? And where will the
            #   fieldStats come from, anyway?
            self._model = ModelFactory.create(
                modelConfig=params["modelConfig"])
            self._model.enableLearning()
            self._model.enableInference(params["inferenceArgs"])
        else:
            self._hasCheckpoint = True

        # Construct the object for converting a flat input row into a format
        # that is consumable by an OPF model. The definition is only fetched
        # here when the checkpoint path above did not already need it.
        if definition is None:
            definition = fetchDefinition()

        # NOTE: if loading the definition didn't raise, we expect
        #   "inputSchema" to be present; it would be a logic error if it
        #   isn't.
        schema = definition["inputSchema"]

        # Convert it to a sequence of FieldMetaInfo instances
        fieldsMeta = tuple([FieldMetaInfo(*entry) for entry in schema])
        self._inputRowEncoder = _InputRowEncoder(fieldsMeta=fieldsMeta)

        # If the checkpoint was incremental, feed the cached data into the model
        for sample in self._inputSamplesSinceLastFullCheckpoint:
            # Convert a flat input sample into a format that is consumable by
            # an OPF model
            self._inputRowEncoder.appendRecord(sample)

            # Infer
            self._model.run(self._inputRowEncoder.getNextRecordDict())
Beispiel #30
0
    def testGetNextRecordDictWithResetFieldWithoutSequenceField(self):
        """Check getNextRecordDict() on a stream with reset but no sequence field.

        `getNextRecord` is patched to hand back one canned record per step and
        the full dict from `getNextRecordDict()` is compared each time. The
        expected `_sequenceId` starts at 0, goes up by one on the next record
        carrying reset=1, stays put on reset=0, and is back at 0 after
        `rewind()`.
        """
        schema = [
            FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
            FieldMetaInfo('timestamp', FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp),
            FieldMetaInfo('integer', FieldMetaType.integer,
                          FieldMetaSpecial.none),
            FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
            FieldMetaInfo('reset', FieldMetaType.integer,
                          FieldMetaSpecial.reset),
            FieldMetaInfo('categories', FieldMetaType.list,
                          FieldMetaSpecial.category)
        ]

        stream = self.MyRecordStream(schema)

        # Each case: (record name, day of March 2010, reset flag,
        #             expected _sequenceId, rewind the stream first?)
        cases = (
            ('rec_1', 1, 1, 0, False),
            # One more time to verify incrementing sequence id
            ('rec_2', 2, 1, 1, False),
            # Now with reset turned off, expecting no change to sequence id
            ('rec_3', 3, 0, 1, False),
            # Now check that rewind resets sequence id
            ('rec_4', 4, 1, 0, True),
        )

        for recName, day, resetFlag, expectedSeqId, rewindFirst in cases:
            cannedRecord = [
                recName,
                datetime(day=day, month=3, year=2010), 5, 6.5, resetFlag,
                [0, 1, 2]
            ]
            stamp = datetime(2010, 3, day, 0, 0)
            expected = {
                'name': recName,
                'timestamp': stamp,
                'integer': 5,
                'real': 6.5,
                'reset': resetFlag,
                'categories': [0, 1, 2],
                '_category': [0, 1, 2],
                '_reset': resetFlag,
                '_sequenceId': expectedSeqId,
                '_timestamp': stamp,
                '_timestampRecordIdx': None
            }

            with mock.patch.object(stream,
                                   'getNextRecord',
                                   autospec=True,
                                   return_value=cannedRecord):
                # The rewind happens while the patch is active, as before
                if rewindFirst:
                    stream.rewind()
                actual = stream.getNextRecordDict()

                self.assertEqual(actual, expected)