def testBadDataset(self):

  filename = _getTempFileName()

  print 'Creating tempfile:', filename

  # Write bad dataset with records going backwards in time
  fields = [FieldMetaInfo('timestamp', FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp)]
  o = FileRecordStream(streamID=filename, write=True, fields=fields)
  # Records
  records = (
      [datetime(day=3, month=3, year=2010)],
      [datetime(day=2, month=3, year=2010)])

  o.appendRecord(records[0])
  o.appendRecord(records[1])
  o.close()

  # Write bad dataset with broken sequences
  fields = [FieldMetaInfo('sid', FieldMetaType.integer,
                          FieldMetaSpecial.sequence)]
  o = FileRecordStream(streamID=filename, write=True, fields=fields)
  # Records
  records = ([1], [2], [1])

  o.appendRecord(records[0])
  o.appendRecord(records[1])
  self.assertRaises(Exception, o.appendRecord, (records[2],))
  o.close()

def __getListMetaInfo(self, inferenceElement):
  """ Get field metadata information for inferences that are of list type
  TODO: Right now we assume list inferences are associated with the input
  field metadata
  """
  fieldMetaInfo = []
  inferenceLabel = InferenceElement.getLabel(inferenceElement)

  for inputFieldMeta in self.__inputFieldsMeta:
    if InferenceElement.getInputElement(inferenceElement):
      outputFieldMeta = FieldMetaInfo(name=inputFieldMeta.name + ".actual",
                                      type=inputFieldMeta.type,
                                      special=inputFieldMeta.special)

    predictionField = FieldMetaInfo(name=inputFieldMeta.name + "." + inferenceLabel,
                                    type=inputFieldMeta.type,
                                    special=inputFieldMeta.special)

    fieldMetaInfo.append(outputFieldMeta)
    fieldMetaInfo.append(predictionField)

  return fieldMetaInfo

def testEncoderWithoutResetAndSequenceFields(self):
  fields = [
      FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
      FieldMetaInfo('timestamp', FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo('integer', FieldMetaType.integer, FieldMetaSpecial.none),
      FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
      FieldMetaInfo('categories', FieldMetaType.list, FieldMetaSpecial.category)
  ]

  encoder = ModelRecordEncoder(fields=fields)

  result = encoder.encode(
      ['rec_1', datetime(day=1, month=3, year=2010), 5, 6.5, [0, 1, 2]])

  self.assertEqual(
      result,
      {
          'name': 'rec_1',
          'timestamp': datetime(2010, 3, 1, 0, 0),
          'integer': 5,
          'real': 6.5,
          'categories': [0, 1, 2],
          '_category': [0, 1, 2],
          '_reset': 0,
          '_sequenceId': 0,
          '_timestamp': datetime(2010, 3, 1, 0, 0),
          '_timestampRecordIdx': None
      })

  # One more time to verify that sequence id is still 0
  result = encoder.encode(
      ['rec_2', datetime(day=2, month=3, year=2010), 5, 6.5, [0, 1, 2]])

  self.assertEqual(
      result,
      {
          'name': 'rec_2',
          'timestamp': datetime(2010, 3, 2, 0, 0),
          'integer': 5,
          'real': 6.5,
          'categories': [0, 1, 2],
          '_category': [0, 1, 2],
          '_reset': 0,
          '_sequenceId': 0,
          '_timestamp': datetime(2010, 3, 2, 0, 0),
          '_timestampRecordIdx': None
      })

def __init__(self, sequence_model, bookmark=None):
  super(TimeSeriesStream, self).__init__()
  self.sequence = sequence_model
  self._fields = [FieldMetaInfo("series", "float", "")]
  self._fieldCount = len(self._fields)
  if bookmark is not None:
    self.sequence.set_theta(bookmark)

def testFieldMetaInfo(self):
  # Create a single FieldMetaInfo instance from a File field's meta-data tuple
  e = ("pounds", FieldMetaType.float, FieldMetaSpecial.none)
  m = FieldMetaInfo.createFromFileFieldElement(e)

  self.assertEqual(e, m)

  # Create a list of FieldMetaInfo instances from a list of File meta-data
  # tuples
  el = [("pounds", FieldMetaType.float, FieldMetaSpecial.none),
        ("price", FieldMetaType.float, FieldMetaSpecial.none),
        ("id", FieldMetaType.string, FieldMetaSpecial.sequence),
        ("date", FieldMetaType.datetime, FieldMetaSpecial.timestamp)]
  ml = FieldMetaInfo.createListFromFileFieldList(el)

  self.assertEqual(el, ml)

def __getDictMetaInfo(self, inferenceElement, inferenceDict):
  """Get field metadata information for inferences that are of dict type"""
  fieldMetaInfo = []
  inferenceLabel = InferenceElement.getLabel(inferenceElement)

  if InferenceElement.getInputElement(inferenceElement):
    fieldMetaInfo.append(FieldMetaInfo(name=inferenceLabel + ".actual",
                                       type=FieldMetaType.string,
                                       special=''))

  keys = sorted(inferenceDict.keys())
  for key in keys:
    fieldMetaInfo.append(FieldMetaInfo(name=inferenceLabel + "." + str(key),
                                       type=FieldMetaType.string,
                                       special=''))

  return fieldMetaInfo

def getDatasetFieldMetaData(self):
  """ [virtual method override]

  Returns:  a tuple of dataset field metadata descriptors that are arranged in
            the same order as the columns in the dataset. Each field metadata
            descriptor is of type nupic.data.fieldmeta.FieldMetaInfo
  """
  return FieldMetaInfo.createListFromFileFieldList(self._reader.getFields())

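# Illustrative sketch (not part of the method above): the descriptors returned
# by getDatasetFieldMetaData() are FieldMetaInfo tuples, so callers can inspect
# them by attribute. The FileRecordStream usage mirrors the tests elsewhere in
# this section; the helper name and the path "data.csv" are hypothetical.
def _printDatasetFieldMetaData(filename="data.csv"):
  from nupic.data.file_record_stream import FileRecordStream

  stream = FileRecordStream(streamID=filename, write=False)
  try:
    for fieldMeta in stream.getFields():
      # Each FieldMetaInfo exposes name, type and special as attributes
      print("%s %s %s" % (fieldMeta.name, fieldMeta.type, fieldMeta.special))
  finally:
    stream.close()
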
def testRewindBeforeModelRecordEncoderIsCreated(self):
  fields = [
      FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
  ]

  stream = self.MyRecordStream(fields)

  # Check that it doesn't crash by trying to operate on an absent encoder
  self.assertIsNone(stream._modelRecordEncoder)

  stream.rewind()

def _testTemporalShift():
  """ Test to see if the metrics manager correctly shifts records for
  multistep prediction cases
  """
  print "*Testing Multistep temporal shift*..."

  from nupic.data.field_meta import (FieldMetaInfo, FieldMetaType,
                                     FieldMetaSpecial)
  from nupic.frameworks.opf.metrics import MetricSpec
  from nupic.frameworks.opf.opf_utils import ModelResult, SensorInput

  onlineMetrics = ()

  modelFieldMetaInfo = (FieldMetaInfo(name='consumption',
                                      type=FieldMetaType.float,
                                      special=FieldMetaSpecial.none),)

  mgr = MetricsManager(metricSpecs=onlineMetrics,
                       fieldInfo=modelFieldMetaInfo,
                       inferenceType=InferenceType.TemporalMultiStep)

  groundTruths = [{'consumption': i} for i in range(10)]

  oneStepInfs = reversed(range(10))
  threeStepInfs = range(5, 15)

  for iterNum, gt, os, ts in zip(xrange(10), groundTruths,
                                 oneStepInfs, threeStepInfs):
    inferences = {InferenceElement.multiStepPredictions: {1: os, 3: ts}}
    sensorInput = SensorInput(dataDict=[gt])
    result = ModelResult(sensorInput=sensorInput, inferences=inferences)
    mgr.update(result)

    assert mgr._getGroundTruth(InferenceElement.multiStepPredictions)[0] == gt

    if iterNum < 1:
      #assert mgr._getInference(InferenceElement.multiStepPredictions) is None
      assert mgr._getInference(InferenceElement.multiStepPredictions)[1] is None
    else:
      prediction = mgr._getInference(InferenceElement.multiStepPredictions)[1]
      assert prediction == 10 - iterNum

    if iterNum < 3:
      inference = mgr._getInference(InferenceElement.multiStepPredictions)
      assert inference is None or inference[3] is None
    else:
      prediction = mgr._getInference(InferenceElement.multiStepPredictions)[3]
      assert prediction == iterNum + 2

def testEncoderWithSequenceAndResetFields(self):
  fields = [
      FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
      FieldMetaInfo('timestamp', FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo('integer', FieldMetaType.integer, FieldMetaSpecial.none),
      FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
      FieldMetaInfo('reset', FieldMetaType.integer, FieldMetaSpecial.reset),
      FieldMetaInfo('sid', FieldMetaType.string, FieldMetaSpecial.sequence),
      FieldMetaInfo('categories', FieldMetaType.list, FieldMetaSpecial.category)
  ]

  encoder = ModelRecordEncoder(fields=fields)

  result = encoder.encode(['rec_1', datetime(day=1, month=3, year=2010),
                           5, 6.5, 1, 99, [0, 1, 2]])

  self.assertEqual(
      result,
      {
          'name': 'rec_1',
          'timestamp': datetime(2010, 3, 1, 0, 0),
          'integer': 5,
          'real': 6.5,
          'reset': 1,
          'sid': 99,
          'categories': [0, 1, 2],
          '_category': [0, 1, 2],
          '_reset': 1,
          '_sequenceId': 99,
          '_timestamp': datetime(2010, 3, 1, 0, 0),
          '_timestampRecordIdx': None
      })

def _testMetricsMgr():
  print "*Testing Metrics Managers*..."

  from nupic.data.field_meta import (FieldMetaInfo, FieldMetaType,
                                     FieldMetaSpecial)
  from nupic.frameworks.opf.metrics import MetricSpec
  from nupic.frameworks.opf.opf_utils import ModelResult, SensorInput

  onlineMetrics = (MetricSpec(metric="aae", inferenceElement='',
                              field="consumption", params={}),)

  print "TESTING METRICS MANAGER (BASIC PLUMBING TEST)..."

  modelFieldMetaInfo = (FieldMetaInfo(name='temperature',
                                      type=FieldMetaType.float,
                                      special=FieldMetaSpecial.none),
                        FieldMetaInfo(name='consumption',
                                      type=FieldMetaType.float,
                                      special=FieldMetaSpecial.none))

  # -----------------------------------------------------------------------
  # Test to make sure that invalid InferenceElements are caught
  try:
    MetricsManager(metricSpecs=onlineMetrics,
                   fieldInfo=modelFieldMetaInfo,
                   inferenceType=InferenceType.TemporalNextStep)
  except ValueError:
    print "Caught bad inference element: PASS"

  print
  onlineMetrics = (MetricSpec(metric="aae",
                              inferenceElement=InferenceElement.prediction,
                              field="consumption", params={}),)

  temporalMetrics = MetricsManager(metricSpecs=onlineMetrics,
                                   fieldInfo=modelFieldMetaInfo,
                                   inferenceType=InferenceType.TemporalNextStep)

  inputs = [
      {'groundTruthRow': [9, 7],
       'predictionsDict': {InferenceType.TemporalNextStep: [12, 17]}},
      {'groundTruthRow': [12, 17],
       'predictionsDict': {InferenceType.TemporalNextStep: [14, 19]}},
      {'groundTruthRow': [14, 20],
       'predictionsDict': {InferenceType.TemporalNextStep: [16, 21]}},
      {'groundTruthRow': [9, 7],
       'predictionsDict': {InferenceType.TemporalNextStep: None}},
  ]

  for element in inputs:
    groundTruthRow = element['groundTruthRow']
    tPredictionRow = element['predictionsDict'][InferenceType.TemporalNextStep]

    result = ModelResult(sensorInput=SensorInput(dataRow=groundTruthRow,
                                                 dataEncodings=None,
                                                 sequenceReset=0,
                                                 category=None),
                         inferences={'prediction': tPredictionRow})

    temporalMetrics.update(result)

  assert temporalMetrics.getMetrics().values()[0] == 15.0 / 3.0, \
      "Expected %f, got %f" % (15.0 / 3.0,
                               temporalMetrics.getMetrics().values()[0])
  print "ok"

  return

def __openDatafile(self, modelResult):
  """Open the data file and write the header row"""

  # Write reset bit
  resetFieldMeta = FieldMetaInfo(name="reset",
                                 type=FieldMetaType.integer,
                                 special=FieldMetaSpecial.reset)
  self.__outputFieldsMeta.append(resetFieldMeta)

  # -----------------------------------------------------------------------
  # Write each of the raw inputs that go into the encoders
  rawInput = modelResult.rawInput
  rawFields = list(rawInput.keys())
  rawFields.sort()
  for field in rawFields:
    if field.startswith('_') or field == 'reset':
      continue
    value = rawInput[field]
    meta = FieldMetaInfo(name=field,
                         type=FieldMetaType.string,
                         special=FieldMetaSpecial.none)
    self.__outputFieldsMeta.append(meta)
    self._rawInputNames.append(field)

  # -----------------------------------------------------------------------
  # Handle each of the inference elements
  for inferenceElement, value in modelResult.inferences.items():
    inferenceLabel = InferenceElement.getLabel(inferenceElement)

    # TODO: Right now we assume list inferences are associated with
    # the input field metadata
    if type(value) in (list, tuple):
      # Append input and prediction field meta-info
      self.__outputFieldsMeta.extend(self.__getListMetaInfo(inferenceElement))

    elif isinstance(value, dict):
      self.__outputFieldsMeta.extend(
          self.__getDictMetaInfo(inferenceElement, value))

    else:
      if InferenceElement.getInputElement(inferenceElement):
        self.__outputFieldsMeta.append(
            FieldMetaInfo(name=inferenceLabel + ".actual",
                          type=FieldMetaType.string,
                          special=''))
      self.__outputFieldsMeta.append(FieldMetaInfo(name=inferenceLabel,
                                                   type=FieldMetaType.string,
                                                   special=''))

  if self.__metricNames:
    for metricName in self.__metricNames:
      metricField = FieldMetaInfo(name=metricName,
                                  type=FieldMetaType.float,
                                  special=FieldMetaSpecial.none)
      self.__outputFieldsMeta.append(metricField)

  # Create the inference directory for our experiment
  inferenceDir = _FileUtils.createExperimentInferenceDir(self.__experimentDir)

  # Construct the prediction dataset file path
  filename = (self.__label + "." +
              opf_utils.InferenceType.getLabel(self.__inferenceType) +
              ".predictionLog.csv")
  self.__datasetPath = os.path.join(inferenceDir, filename)

  # Create the output dataset
  print("OPENING OUTPUT FOR PREDICTION WRITER AT: %r" % self.__datasetPath)
  print("Prediction field-meta: %r" %
        ([tuple(i) for i in self.__outputFieldsMeta],))
  self.__dataset = FileRecordStream(streamID=self.__datasetPath,
                                    write=True,
                                    fields=self.__outputFieldsMeta)

  # Copy data from checkpoint cache
  if self.__checkpointCache is not None:
    self.__checkpointCache.seek(0)

    reader = csv.reader(self.__checkpointCache, dialect='excel')

    # Skip header row
    try:
      header = next(reader)
    except StopIteration:
      print("Empty record checkpoint initializer for %r" %
            (self.__datasetPath,))
    else:
      assert tuple(self.__dataset.getFieldNames()) == tuple(header), \
          "dataset.getFieldNames(): %r; predictionCheckpointFieldNames: %r" % (
              tuple(self.__dataset.getFieldNames()), tuple(header))

    # Copy the rows from checkpoint
    numRowsCopied = 0
    while True:
      try:
        row = next(reader)
      except StopIteration:
        break

      #print "DEBUG: restoring row from checkpoint: %r" % (row,)

      self.__dataset.appendRecord(row)
      numRowsCopied += 1

    self.__dataset.flush()

    print("Restored %d rows from checkpoint for %r" %
          (numRowsCopied, self.__datasetPath))

    # Dispose of our checkpoint cache
    self.__checkpointCache.close()
    self.__checkpointCache = None

  return

def getDatasetFieldMetaData(self):
  return FieldMetaInfo.createListFromFileFieldList(self._reader.getFields())

def __init__(self, streamID, write=False, fields=None, missingValues=None,
             bookmark=None, includeMS=True, firstRecord=None):
  super(FileRecordStream, self).__init__()

  # Only bookmark or firstRow can be specified, not both
  if bookmark is not None and firstRecord is not None:
    raise RuntimeError(
        "Only bookmark or firstRecord can be specified, not both")

  if fields is None:
    fields = []
  if missingValues is None:
    missingValues = ['']

  # We'll be operating on csvs with arbitrarily long fields
  size = 2**27
  csv.field_size_limit(size)

  self._filename = streamID
  # We can't guarantee what system files are coming from, use universal
  # newlines
  self._write = write
  self._mode = self._FILE_WRITE_MODE if write else self._FILE_READ_MODE
  self._file = open(self._filename, self._mode)
  self._sequences = set()
  self.rewindAtEOF = False

  if write:
    assert fields is not None
    assert isinstance(fields, (tuple, list))
    # Verify all fields are 3-tuple
    assert all(isinstance(f, (tuple, FieldMetaInfo)) and len(f) == 3
               for f in fields)
    names, types, specials = zip(*fields)
    self._writer = csv.writer(self._file)
  else:
    # Read header lines
    self._reader = csv.reader(self._file, dialect="excel")
    try:
      names = [n.strip() for n in self._reader.next()]
    except:
      raise Exception('The header line of the file %s contained a NULL byte'
                      % self._filename)

    types = [t.strip() for t in self._reader.next()]
    specials = [s.strip() for s in self._reader.next()]

    # If there are no specials, this means there was a blank line
    if len(specials) == 0:
      specials = [""]

  if not len(names) == len(types) == len(specials):
    raise Exception('Invalid file format: different number of fields '
                    'in the header rows of file %s (%d, %d, %d)' %
                    (streamID, len(names), len(types), len(specials)))

  # Verify standard file format
  for t in types:
    if not FieldMetaType.isValid(t):
      raise Exception('Invalid file format for "%s" - field type "%s" '
                      'not a valid FieldMetaType' % (self._filename, t))

  for s in specials:
    if not FieldMetaSpecial.isValid(s):
      raise Exception('Invalid file format. \'%s\' is not a valid special '
                      'flag' % s)

  self._fields = [FieldMetaInfo(*attrs)
                  for attrs in zip(names, types, specials)]
  self._fieldCount = len(self._fields)

  # Keep track of how many records have been read/written
  self._recordCount = 0

  self._timeStampIdx = (specials.index(FieldMetaSpecial.timestamp)
                        if FieldMetaSpecial.timestamp in specials else None)
  self._resetIdx = (specials.index(FieldMetaSpecial.reset)
                    if FieldMetaSpecial.reset in specials else None)
  self._sequenceIdIdx = (specials.index(FieldMetaSpecial.sequence)
                         if FieldMetaSpecial.sequence in specials else None)
  self._categoryIdx = (specials.index(FieldMetaSpecial.category)
                       if FieldMetaSpecial.category in specials else None)
  self._learningIdx = (specials.index(FieldMetaSpecial.learning)
                       if FieldMetaSpecial.learning in specials else None)

  # keep track of the current sequence
  self._currSequence = None
  self._currTime = None

  if self._timeStampIdx:
    assert types[self._timeStampIdx] == FieldMetaType.datetime
  if self._sequenceIdIdx:
    assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                          FieldMetaType.integer)
  if self._resetIdx:
    assert types[self._resetIdx] == FieldMetaType.integer
  if self._categoryIdx:
    assert types[self._categoryIdx] in (FieldMetaType.list,
                                        FieldMetaType.integer)
  if self._learningIdx:
    assert types[self._learningIdx] == FieldMetaType.integer

  # Convert the types to the actual types in order to convert the strings
  if self._mode == self._FILE_READ_MODE:
    m = {FieldMetaType.integer: intOrNone,
         FieldMetaType.float: floatOrNone,
         FieldMetaType.boolean: parseBool,
         FieldMetaType.string: unescape,
         FieldMetaType.datetime: parseTimestamp,
         FieldMetaType.sdr: parseSdr,
         FieldMetaType.list: parseStringList}
  else:
    if includeMS:
      datetimeFunc = serializeTimestamp
    else:
      datetimeFunc = serializeTimestampNoMS
    m = {FieldMetaType.integer: str,
         FieldMetaType.float: str,
         FieldMetaType.string: escape,
         FieldMetaType.boolean: str,
         FieldMetaType.datetime: datetimeFunc,
         FieldMetaType.sdr: serializeSdr,
         FieldMetaType.list: stripList}

  self._adapters = [m[t] for t in types]

  self._missingValues = missingValues

  #
  # If the bookmark is set, we need to skip over first N records
  #
  if bookmark is not None:
    rowsToSkip = self._getStartRow(bookmark)
  elif firstRecord is not None:
    rowsToSkip = firstRecord
  else:
    rowsToSkip = 0

  while rowsToSkip > 0:
    self.next()
    rowsToSkip -= 1

  # Dictionary to store record statistics (min and max of scalars for now)
  self._stats = None

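# Illustrative sketch (not part of the class above): the read branch expects a
# CSV with three header rows -- field names, field types, then "special" flags
# -- ahead of the data rows. A file written by FileRecordStream for the fields
# below would begin roughly like:
#
#   timestamp,consumption
#   datetime,float
#   T,
#   2010-03-01 00:00:00.000000,5.5
#
# A minimal write/read round trip; the helper name and the filename argument
# are hypothetical, and the imports mirror the tests elsewhere in this section.
def _exampleHeaderRoundTrip(filename):
  from datetime import datetime
  from nupic.data.field_meta import (FieldMetaInfo, FieldMetaType,
                                     FieldMetaSpecial)
  from nupic.data.file_record_stream import FileRecordStream

  fields = [
      FieldMetaInfo('timestamp', FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo('consumption', FieldMetaType.float, FieldMetaSpecial.none)]

  writer = FileRecordStream(streamID=filename, write=True, fields=fields)
  writer.appendRecord([datetime(day=1, month=3, year=2010), 5.5])
  writer.close()

  reader = FileRecordStream(streamID=filename, write=False)
  assert reader.getFields() == fields
  record = reader.getNextRecord()
  reader.close()
  return record
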
def __init__(self, streamDef, bookmark=None, saveOutput=False,
             isBlocking=True, maxTimeout=0, eofOnTimeout=False):
  # Call superclass constructor
  super(StreamReader, self).__init__()

  loggerPrefix = 'com.numenta.nupic.data.StreamReader'
  self._logger = logging.getLogger(loggerPrefix)
  json_helpers.validate(streamDef,
                        schemaPath=pkg_resources.resource_filename(
                            jsonschema.__name__, "stream_def.json"))
  assert len(streamDef['streams']) == 1, "Only 1 source stream is supported"

  # Save constructor args
  sourceDict = streamDef['streams'][0]
  self._recordCount = 0
  self._eofOnTimeout = eofOnTimeout
  self._logger.debug('Reading stream with the def: %s', sourceDict)

  # Dictionary to store record statistics (min and max of scalars for now)
  self._stats = None

  # ---------------------------------------------------------------------
  # Get the stream definition params

  # Limiting window of the stream. It would not return any records until
  # 'first_record' ID is read (or very first with the ID above that). The
  # stream will return EOS once it reads record with ID 'last_record' or
  # above (NOTE: the name 'lastRecord' is misleading because it is NOT
  # inclusive).
  firstRecordIdx = sourceDict.get('first_record', None)
  self._sourceLastRecordIdx = sourceDict.get('last_record', None)

  # If a bookmark was given, then override first_record from the stream
  # definition.
  if bookmark is not None:
    firstRecordIdx = None

  # Column names must be provided in the streamdef json
  # Special case is ['*'], meaning all available names from the record stream
  self._streamFieldNames = sourceDict.get('columns', None)
  if self._streamFieldNames is not None and self._streamFieldNames[0] == '*':
    self._needFieldsFiltering = False
  else:
    self._needFieldsFiltering = True

  # Types must be specified in streamdef json, or in case of the
  # file_record_stream types could be implicit from the file
  streamFieldTypes = sourceDict.get('types', None)
  self._logger.debug('Types from the def: %s', streamFieldTypes)
  # Validate that all types are valid
  if streamFieldTypes is not None:
    for dataType in streamFieldTypes:
      assert FieldMetaType.isValid(dataType)

  # Reset, sequence and time fields might be provided by streamdef json
  streamResetFieldName = streamDef.get('resetField', None)
  streamTimeFieldName = streamDef.get('timeField', None)
  streamSequenceFieldName = streamDef.get('sequenceIdField', None)
  self._logger.debug('r, t, s fields: %s, %s, %s', streamResetFieldName,
                     streamTimeFieldName, streamSequenceFieldName)

  # =======================================================================
  # Open up the underlying record store
  dataUrl = sourceDict.get('source', None)
  assert dataUrl is not None
  self._recordStore = self._openStream(dataUrl, isBlocking, maxTimeout,
                                       bookmark, firstRecordIdx)
  assert self._recordStore is not None

  # =======================================================================
  # Prepare the data structures we need for returning just the fields
  # the caller wants from each record
  recordStoreFields = self._recordStore.getFields()
  self._recordStoreFieldNames = self._recordStore.getFieldNames()

  if not self._needFieldsFiltering:
    self._streamFieldNames = self._recordStoreFieldNames

  # Build up the field definitions for each field. This is a list of tuples
  # of (name, type, special)
  self._streamFields = []
  for dstIdx, name in enumerate(self._streamFieldNames):
    if name not in self._recordStoreFieldNames:
      raise RuntimeError("The column '%s' from the stream definition "
          "is not present in the underlying stream which has the following "
          "columns: %s" % (name, self._recordStoreFieldNames))

    fieldIdx = self._recordStoreFieldNames.index(name)
    fieldType = recordStoreFields[fieldIdx].type
    fieldSpecial = recordStoreFields[fieldIdx].special

    # If the types or specials were defined in the stream definition,
    # then override what was found in the record store
    if streamFieldTypes is not None:
      fieldType = streamFieldTypes[dstIdx]

    if streamResetFieldName is not None and streamResetFieldName == name:
      fieldSpecial = FieldMetaSpecial.reset
    if streamTimeFieldName is not None and streamTimeFieldName == name:
      fieldSpecial = FieldMetaSpecial.timestamp
    if (streamSequenceFieldName is not None and
        streamSequenceFieldName == name):
      fieldSpecial = FieldMetaSpecial.sequence

    self._streamFields.append(FieldMetaInfo(name, fieldType, fieldSpecial))

  # ========================================================================
  # Create the aggregator which will handle aggregation of records before
  # returning them.
  self._aggregator = Aggregator(
      aggregationInfo=streamDef.get('aggregation', None),
      inputFields=recordStoreFields,
      timeFieldName=streamDef.get('timeField', None),
      sequenceIdFieldName=streamDef.get('sequenceIdField', None),
      resetFieldName=streamDef.get('resetField', None))

  # We rely on the aggregator to tell us the bookmark of the last raw input
  # that contributed to the aggregated record
  self._aggBookmark = None

  # Compute the aggregation period in terms of months and seconds
  if 'aggregation' in streamDef:
    self._aggMonthsAndSeconds = nupic.support.aggregationToMonthsSeconds(
        streamDef.get('aggregation'))
  else:
    self._aggMonthsAndSeconds = None

  # ========================================================================
  # Are we saving the generated output to a csv?
  if saveOutput:
    tmpDir = tempfile.mkdtemp()
    outFilename = os.path.join(tmpDir, "generated_output.csv")
    self._logger.info("StreamReader: Saving generated records to: '%s'" %
                      outFilename)
    self._writer = FileRecordStream(streamID=outFilename,
                                    write=True,
                                    fields=self._streamFields)
  else:
    self._writer = None

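# Illustrative sketch (not part of the constructor above): a minimal stream
# definition containing the keys this constructor reads. The values are
# hypothetical -- in particular the "file://" source URL -- and a real
# definition may need additional keys required by stream_def.json, so treat
# this as a shape sketch rather than a validated example.
_exampleStreamDef = {
    'version': 1,
    'info': 'example stream definition',
    'streams': [{
        'source': 'file://path/to/data.csv',  # URL scheme is an assumption
        'info': 'example source',
        'columns': ['*'],  # '*' passes through every record-store column
        # 'types', 'first_record' and 'last_record' may also appear here
    }],
    # Optional top-level keys read above: 'timeField', 'resetField',
    # 'sequenceIdField' and 'aggregation'
    'timeField': 'timestamp',
}

# reader = StreamReader(streamDef=_exampleStreamDef, saveOutput=False)
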
def testEncoderWithResetFieldWithoutSequenceField(self):
  fields = [
      FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
      FieldMetaInfo('timestamp', FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo('integer', FieldMetaType.integer, FieldMetaSpecial.none),
      FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
      FieldMetaInfo('reset', FieldMetaType.integer, FieldMetaSpecial.reset),
      FieldMetaInfo('categories', FieldMetaType.list, FieldMetaSpecial.category)
  ]

  encoder = ModelRecordEncoder(fields=fields)

  result = encoder.encode(['rec_1', datetime(day=1, month=3, year=2010),
                           5, 6.5, 1, [0, 1, 2]])
  self.assertEqual(
      result,
      {
          'name': 'rec_1',
          'timestamp': datetime(2010, 3, 1, 0, 0),
          'integer': 5,
          'real': 6.5,
          'reset': 1,
          'categories': [0, 1, 2],
          '_category': [0, 1, 2],
          '_reset': 1,
          '_sequenceId': 0,
          '_timestamp': datetime(2010, 3, 1, 0, 0),
          '_timestampRecordIdx': None
      })

  # One more time to verify incrementing sequence id
  result = encoder.encode(['rec_2', datetime(day=2, month=3, year=2010),
                           5, 6.5, 1, [0, 1, 2]])
  self.assertEqual(
      result,
      {
          'name': 'rec_2',
          'timestamp': datetime(2010, 3, 2, 0, 0),
          'integer': 5,
          'real': 6.5,
          'reset': 1,
          'categories': [0, 1, 2],
          '_category': [0, 1, 2],
          '_reset': 1,
          '_sequenceId': 1,
          '_timestamp': datetime(2010, 3, 2, 0, 0),
          '_timestampRecordIdx': None
      })

  # Now with reset turned off, expecting no change to sequence id
  result = encoder.encode(['rec_3', datetime(day=3, month=3, year=2010),
                           5, 6.5, 0, [0, 1, 2]])
  self.assertEqual(
      result,
      {
          'name': 'rec_3',
          'timestamp': datetime(2010, 3, 3, 0, 0),
          'integer': 5,
          'real': 6.5,
          'reset': 0,
          'categories': [0, 1, 2],
          '_category': [0, 1, 2],
          '_reset': 0,
          '_sequenceId': 1,
          '_timestamp': datetime(2010, 3, 3, 0, 0),
          '_timestampRecordIdx': None
      })

  # Now check that rewind resets sequence id
  encoder.rewind()
  result = encoder.encode(['rec_4', datetime(day=4, month=3, year=2010),
                           5, 6.5, 1, [0, 1, 2]])
  self.assertEqual(
      result,
      {
          'name': 'rec_4',
          'timestamp': datetime(2010, 3, 4, 0, 0),
          'integer': 5,
          'real': 6.5,
          'reset': 1,
          'categories': [0, 1, 2],
          '_category': [0, 1, 2],
          '_reset': 1,
          '_sequenceId': 0,
          '_timestamp': datetime(2010, 3, 4, 0, 0),
          '_timestampRecordIdx': None
      })

def testMissingValues(self):
  print "Beginning Missing Data test..."
  filename = _getTempFileName()

  # Some values missing of each type
  # read dataset from disk, retrieve values
  # string should return empty string, numeric types sentinelValue

  print 'Creating tempfile:', filename

  # write dataset to disk with float, int, and string fields
  fields = [
      FieldMetaInfo('timestamp', FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
      FieldMetaInfo('integer', FieldMetaType.integer, FieldMetaSpecial.none),
      FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none)
  ]

  s = FileRecordStream(streamID=filename, write=True, fields=fields)

  # Records
  records = (
      [datetime(day=1, month=3, year=2010), 'rec_1', 5, 6.5],
      [datetime(day=2, month=3, year=2010), '', 8, 7.5],
      [datetime(day=3, month=3, year=2010), 'rec_3', '', 8.5],
      [datetime(day=4, month=3, year=2010), 'rec_4', 12, ''],
      [datetime(day=5, month=3, year=2010), 'rec_5', -87657496599, 6.5],
      [datetime(day=6, month=3, year=2010), 'rec_6', 12, -87657496599],
      [datetime(day=6, month=3, year=2010), str(-87657496599), 12, 6.5])

  for r in records:
    s.appendRecord(list(r))

  s.close()

  # Read the standard file
  s = FileRecordStream(streamID=filename, write=False)

  fieldsRead = s.getFields()
  self.assertEqual(fields, fieldsRead)

  recordsRead = []
  while True:
    r = s.getNextRecord()
    if r is None:
      break
    print 'Reading record ...'
    print r
    recordsRead.append(r)

  # sort the records by date, so we know for sure which is which
  recordsRead = sorted(recordsRead, key=lambda rec: rec[0])

  # empty string
  self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[1][1])

  # missing int
  self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[2][2])

  # missing float
  self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[3][3])

  # sentinel value in input handled correctly for int field
  self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[4][2])

  # sentinel value in input handled correctly for float field
  self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[5][3])

  # sentinel value in input handled correctly for string field
  # this should leave the string as-is, since a missing string
  # is encoded not with a sentinel value but with an empty string
  self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[6][1])

def testFieldMetaInfoRaisesValueErrorOnInvalidFieldType(self):
  with self.assertRaises(ValueError):
    FieldMetaInfo("fieldName", "bogus-type", FieldMetaSpecial.none)

def testEncoderWithSequenceFieldWithoutResetField(self):
  fields = [
      FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
      FieldMetaInfo('timestamp', FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo('integer', FieldMetaType.integer, FieldMetaSpecial.none),
      FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
      FieldMetaInfo('sid', FieldMetaType.string, FieldMetaSpecial.sequence),
      FieldMetaInfo('categories', FieldMetaType.list, FieldMetaSpecial.category)
  ]

  encoder = ModelRecordEncoder(fields=fields)

  # _reset should be 1 the first time
  result = encoder.encode(['rec_1', datetime(day=1, month=3, year=2010),
                           5, 6.5, 99, [0, 1, 2]])
  self.assertEqual(
      result,
      {
          'name': 'rec_1',
          'timestamp': datetime(2010, 3, 1, 0, 0),
          'integer': 5,
          'real': 6.5,
          'sid': 99,
          'categories': [0, 1, 2],
          '_category': [0, 1, 2],
          '_reset': 1,
          '_sequenceId': 99,
          '_timestamp': datetime(2010, 3, 1, 0, 0),
          '_timestampRecordIdx': None
      })

  # _reset should be 0 when same sequence id is repeated
  result = encoder.encode(['rec_2', datetime(day=2, month=3, year=2010),
                           5, 6.5, 99, [0, 1, 2]])
  self.assertEqual(
      result,
      {
          'name': 'rec_2',
          'timestamp': datetime(2010, 3, 2, 0, 0),
          'integer': 5,
          'real': 6.5,
          'sid': 99,
          'categories': [0, 1, 2],
          '_category': [0, 1, 2],
          '_reset': 0,
          '_sequenceId': 99,
          '_timestamp': datetime(2010, 3, 2, 0, 0),
          '_timestampRecordIdx': None
      })

  # _reset should be 1 when sequence id changes
  result = encoder.encode(['rec_3', datetime(day=2, month=3, year=2010),
                           5, 6.5, 100, [0, 1, 2]])
  self.assertEqual(
      result,
      {
          'name': 'rec_3',
          'timestamp': datetime(2010, 3, 2, 0, 0),
          'integer': 5,
          'real': 6.5,
          'sid': 100,
          'categories': [0, 1, 2],
          '_category': [0, 1, 2],
          '_reset': 1,
          '_sequenceId': 100,
          '_timestamp': datetime(2010, 3, 2, 0, 0),
          '_timestampRecordIdx': None
      })

def __init__(self, streamName, write=False, fields=None, missingValues=None,
             bookmark=None, includeMS=True, firstRecord=None, names=None):
  super(kafkaRecordStream, self).__init__()

  # Only bookmark or firstRow can be specified, not both
  if bookmark is not None and firstRecord is not None:
    raise RuntimeError(
        "Only bookmark or firstRecord can be specified, not both")

  if fields is None:
    fields = []
  if missingValues is None:
    missingValues = ['']

  # We'll be operating on csvs with arbitrarily long fields
  size = 2**27
  csv.field_size_limit(size)

  self.name = streamName

  specials = []
  specials.append('')
  types = []
  types.append('float')
  if names is None:
    names = []
    names.append('cpu')
  elif len(names) == 2:
    specials.append('')
    types.append('float')

  # We can't guarantee what system files are coming from, use universal
  # newlines
  self._fields = [FieldMetaInfo(*attrs)
                  for attrs in zip(names, types, specials)]
  self._fieldCount = len(self._fields)

  # Keep track of how many records have been read/written
  self._recordCount = 0

  # keep track of the current sequence
  self._currSequence = None
  self._currTime = None

  self._timeStampIdx = None
  self._sequenceIdIdx = None
  self._resetIdx = None
  self._categoryIdx = None
  self._learningIdx = None

  if self._timeStampIdx:
    assert types[self._timeStampIdx] == FieldMetaType.datetime
  if self._sequenceIdIdx:
    assert types[self._sequenceIdIdx] in (FieldMetaType.string,
                                          FieldMetaType.integer)
  if self._resetIdx:
    assert types[self._resetIdx] == FieldMetaType.integer
  if self._categoryIdx:
    assert types[self._categoryIdx] in (FieldMetaType.list,
                                        FieldMetaType.integer)
  if self._learningIdx:
    assert types[self._learningIdx] == FieldMetaType.integer

  # Convert the types to the actual types in order to convert the strings
  m = {FieldMetaType.integer: intOrNone,
       FieldMetaType.float: floatOrNone,
       FieldMetaType.boolean: parseBool,
       FieldMetaType.string: unescape,
       FieldMetaType.datetime: parseTimestamp,
       FieldMetaType.sdr: parseSdr,
       FieldMetaType.list: parseStringList}

  self._adapters = [m[t] for t in types]

  self._missingValues = missingValues

  # Dictionary to store record statistics (min and max of scalars for now)
  self._stats = None

def testBasic(self):
  """Runs basic FileRecordStream tests."""
  filename = _getTempFileName()

  # Write a standard file
  fields = [
      FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
      FieldMetaInfo('timestamp', FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo('integer', FieldMetaType.integer, FieldMetaSpecial.none),
      FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
      FieldMetaInfo('reset', FieldMetaType.integer, FieldMetaSpecial.reset),
      FieldMetaInfo('sid', FieldMetaType.string, FieldMetaSpecial.sequence),
      FieldMetaInfo('categoryField', FieldMetaType.integer,
                    FieldMetaSpecial.category),
  ]
  fieldNames = ['name', 'timestamp', 'integer', 'real', 'reset', 'sid',
                'categoryField']

  print 'Creating temp file:', filename

  with FileRecordStream(streamID=filename, write=True, fields=fields) as s:

    self.assertEqual(0, s.getDataRowCount())

    # Records
    records = (
        ['rec_1', datetime(day=1, month=3, year=2010), 5, 6.5, 1, 'seq-1', 10],
        ['rec_2', datetime(day=2, month=3, year=2010), 8, 7.5, 0, 'seq-1', 11],
        ['rec_3', datetime(day=3, month=3, year=2010), 12, 8.5, 0, 'seq-1', 12])

    self.assertEqual(fields, s.getFields())
    self.assertEqual(0, s.getNextRecordIdx())

    print 'Writing records ...'
    for r in records:
      print list(r)
      s.appendRecord(list(r))

    self.assertEqual(3, s.getDataRowCount())

    recordsBatch = (
        ['rec_4', datetime(day=4, month=3, year=2010), 2, 9.5, 1, 'seq-1', 13],
        ['rec_5', datetime(day=5, month=3, year=2010), 6, 10.5, 0, 'seq-1', 14],
        ['rec_6', datetime(day=6, month=3, year=2010), 11, 11.5, 0, 'seq-1', 15])

    print 'Adding batch of records...'
    for rec in recordsBatch:
      print rec
    s.appendRecords(recordsBatch)

    self.assertEqual(6, s.getDataRowCount())

  with FileRecordStream(filename) as s:

    # Read the standard file
    self.assertEqual(6, s.getDataRowCount())
    self.assertEqual(fieldNames, s.getFieldNames())

    # Note! this is the number of records read so far
    self.assertEqual(0, s.getNextRecordIdx())

    readStats = s.getStats()
    print 'Got stats:', readStats
    expectedStats = {
        'max': [None, None, 12, 11.5, 1, None, 15],
        'min': [None, None, 2, 6.5, 0, None, 10]
    }
    self.assertEqual(expectedStats, readStats)

    readRecords = []
    print 'Reading records ...'
    while True:
      r = s.getNextRecord()
      print r
      if r is None:
        break
      readRecords.append(r)

    allRecords = records + recordsBatch
    for r1, r2 in zip(allRecords, readRecords):
      self.assertEqual(r1, r2)

def testGetNextRecordDictWithResetFieldWithoutSequenceField(self):
  fields = [
      FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
      FieldMetaInfo('timestamp', FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo('integer', FieldMetaType.integer, FieldMetaSpecial.none),
      FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
      FieldMetaInfo('reset', FieldMetaType.integer, FieldMetaSpecial.reset),
      FieldMetaInfo('categories', FieldMetaType.list, FieldMetaSpecial.category)
  ]

  stream = self.MyRecordStream(fields)

  with mock.patch.object(
      stream, 'getNextRecord', autospec=True,
      return_value=['rec_1', datetime(day=1, month=3, year=2010),
                    5, 6.5, 1, [0, 1, 2]]):
    result = stream.getNextRecordDict()

    self.assertEqual(
        result,
        {
            'name': 'rec_1',
            'timestamp': datetime(2010, 3, 1, 0, 0),
            'integer': 5,
            'real': 6.5,
            'reset': 1,
            'categories': [0, 1, 2],
            '_category': [0, 1, 2],
            '_reset': 1,
            '_sequenceId': 0,
            '_timestamp': datetime(2010, 3, 1, 0, 0),
            '_timestampRecordIdx': None
        })

  # One more time to verify incrementing sequence id
  with mock.patch.object(
      stream, 'getNextRecord', autospec=True,
      return_value=['rec_2', datetime(day=2, month=3, year=2010),
                    5, 6.5, 1, [0, 1, 2]]):
    result = stream.getNextRecordDict()

    self.assertEqual(
        result,
        {
            'name': 'rec_2',
            'timestamp': datetime(2010, 3, 2, 0, 0),
            'integer': 5,
            'real': 6.5,
            'reset': 1,
            'categories': [0, 1, 2],
            '_category': [0, 1, 2],
            '_reset': 1,
            '_sequenceId': 1,
            '_timestamp': datetime(2010, 3, 2, 0, 0),
            '_timestampRecordIdx': None
        })

  # Now with reset turned off, expecting no change to sequence id
  with mock.patch.object(
      stream, 'getNextRecord', autospec=True,
      return_value=['rec_3', datetime(day=3, month=3, year=2010),
                    5, 6.5, 0, [0, 1, 2]]):
    result = stream.getNextRecordDict()

    self.assertEqual(
        result,
        {
            'name': 'rec_3',
            'timestamp': datetime(2010, 3, 3, 0, 0),
            'integer': 5,
            'real': 6.5,
            'reset': 0,
            'categories': [0, 1, 2],
            '_category': [0, 1, 2],
            '_reset': 0,
            '_sequenceId': 1,
            '_timestamp': datetime(2010, 3, 3, 0, 0),
            '_timestampRecordIdx': None
        })

  # Now check that rewind resets sequence id
  with mock.patch.object(
      stream, 'getNextRecord', autospec=True,
      return_value=['rec_4', datetime(day=4, month=3, year=2010),
                    5, 6.5, 1, [0, 1, 2]]):
    stream.rewind()
    result = stream.getNextRecordDict()

    self.assertEqual(
        result,
        {
            'name': 'rec_4',
            'timestamp': datetime(2010, 3, 4, 0, 0),
            'integer': 5,
            'real': 6.5,
            'reset': 1,
            'categories': [0, 1, 2],
            '_category': [0, 1, 2],
            '_reset': 1,
            '_sequenceId': 0,
            '_timestamp': datetime(2010, 3, 4, 0, 0),
            '_timestampRecordIdx': None
        })

def testMultipleClasses(self):
  """Runs FileRecordStream tests with multiple category fields."""
  filename = _getTempFileName()

  # Write a standard file
  fields = [
      FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
      FieldMetaInfo('timestamp', FieldMetaType.datetime,
                    FieldMetaSpecial.timestamp),
      FieldMetaInfo('integer', FieldMetaType.integer, FieldMetaSpecial.none),
      FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none),
      FieldMetaInfo('reset', FieldMetaType.integer, FieldMetaSpecial.reset),
      FieldMetaInfo('sid', FieldMetaType.string, FieldMetaSpecial.sequence),
      FieldMetaInfo('categories', FieldMetaType.list, FieldMetaSpecial.category)
  ]
  fieldNames = ['name', 'timestamp', 'integer', 'real', 'reset', 'sid',
                'categories']

  print('Creating temp file:', filename)

  with FileRecordStream(streamID=filename, write=True, fields=fields) as s:

    self.assertEqual(0, s.getDataRowCount())

    # Records
    records = (
        ['rec_1', datetime(day=1, month=3, year=2010), 5, 6.5, 1, 'seq-1',
         [0, 1, 2]],
        ['rec_2', datetime(day=2, month=3, year=2010), 8, 7.5, 0, 'seq-1',
         [3, 4, 5]],
        ['rec_3', datetime(day=3, month=3, year=2010), 2, 8.5, 0, 'seq-1',
         [6, 7, 8]])

    self.assertEqual(fields, s.getFields())
    self.assertEqual(0, s.getNextRecordIdx())

    print('Writing records ...')
    for r in records:
      print(r)
      s.appendRecord(r)

    self.assertEqual(3, s.getDataRowCount())

    recordsBatch = (
        ['rec_4', datetime(day=4, month=3, year=2010), 2, 9.5, 1, 'seq-1',
         [2, 3, 4]],
        ['rec_5', datetime(day=5, month=3, year=2010), 6, 10.5, 0, 'seq-1',
         [3, 4, 5]],
        ['rec_6', datetime(day=6, month=3, year=2010), 11, 11.5, 0, 'seq-1',
         [4, 5, 6]])

    print('Adding batch of records...')
    for rec in recordsBatch:
      print(rec)
    s.appendRecords(recordsBatch)

    self.assertEqual(6, s.getDataRowCount())

  with FileRecordStream(filename) as s:

    # Read the standard file
    self.assertEqual(6, s.getDataRowCount())
    self.assertEqual(fieldNames, s.getFieldNames())

    # Note! this is the number of records read so far
    self.assertEqual(0, s.getNextRecordIdx())

    readStats = s.getStats()
    print('Got stats:', readStats)
    expectedStats = {
        'max': [None, None, 11, 11.5, 1, None, None],
        'min': [None, None, 2, 6.5, 0, None, None]
    }
    self.assertEqual(expectedStats, readStats)

    readRecords = []
    print('Reading records ...')
    while True:
      r = s.getNextRecord()
      print(r)
      if r is None:
        break
      readRecords.append(r)

    expectedRecords = (
        ['rec_1', datetime(day=1, month=3, year=2010), 5, 6.5, 1, 'seq-1',
         [0, 1, 2]],
        ['rec_2', datetime(day=2, month=3, year=2010), 8, 7.5, 0, 'seq-1',
         [3, 4, 5]],
        ['rec_3', datetime(day=3, month=3, year=2010), 2, 8.5, 0, 'seq-1',
         [6, 7, 8]],
        ['rec_4', datetime(day=4, month=3, year=2010), 2, 9.5, 1, 'seq-1',
         [2, 3, 4]],
        ['rec_5', datetime(day=5, month=3, year=2010), 6, 10.5, 0, 'seq-1',
         [3, 4, 5]],
        ['rec_6', datetime(day=6, month=3, year=2010), 11, 11.5, 0, 'seq-1',
         [4, 5, 6]])

    for r1, r2 in zip(expectedRecords, readRecords):
      self.assertEqual(r1, r2)

def testFieldMetaInfoRaisesValueErrorOnInvalidFieldSpecial(self):
  with self.assertRaises(ValueError):
    FieldMetaInfo("fieldName", FieldMetaType.integer, "bogus-special")

def getDatasetFieldMetaData(self):
  return FieldMetaInfo.createListFromFileFieldList(self._reader.getFields())