def test_GenerateDataset(self):
  dataset = 'extra/gym/gym.csv'
  print "Using input dataset: ", dataset

  gymFields = None
  with FileRecordStream(findDataset(dataset)) as f:
    gymFields = f.getFieldNames()

  aggregationOptions = dict(timeField=gymFields.index('timestamp'),
                            fields=[('attendeeCount', sum),
                                    ('consumption', sum),
                                    ('timestamp', lambda x: x[0])],
                            hours=5)

  handle = \
    tempfile.NamedTemporaryFile(prefix='agg_gym_hours_5',
                                suffix='.csv',
                                dir=os.path.dirname(findDataset(dataset)))
  outputFile = handle.name
  handle.close()

  print "Expected outputFile path: ", outputFile
  print "Files in the destination folder before the test:"
  print os.listdir(os.path.abspath(os.path.dirname(findDataset(dataset))))

  if os.path.isfile(outputFile):
    print "Removing existing outputFile: ", outputFile
    os.remove(outputFile)

  self.assertFalse(os.path.exists(outputFile),
                   msg="Shouldn't exist, but does: " + str(outputFile))

  result = generateDataset(aggregationOptions, dataset, outputFile)
  print "generateDataset() returned: ", result

  f1 = os.path.abspath(os.path.normpath(result))
  print "normalized generateDataset() result path: ", f1
  f2 = os.path.normpath(outputFile)
  print "normalized outputFile path: ", f2
  self.assertEqual(f1, f2)

  print "Checking for presence of outputFile: ", outputFile
  self.assertTrue(
      os.path.isfile(outputFile),
      msg="Missing outputFile: %r; normalized generateDataset() result: %r" %
          (outputFile, f1))

  print "Files in the destination folder after the test:"
  print os.listdir(os.path.abspath(os.path.dirname(findDataset(dataset))))

  print result
  print '-' * 30

  return
def getFilename(aggregationInfo, inputFile):
  """Generate the filename for the aggregated dataset.

  The filename is based on the input filename and the aggregation period.

  Returns the inputFile if no aggregation is required (the aggregation info
  has all 0's).
  """

  # Find the actual file, with an absolute path
  inputFile = findDataset(inputFile)

  a = defaultdict(lambda: 0, aggregationInfo)
  outputDir = os.path.dirname(inputFile)
  outputFile = 'agg_%s' % os.path.splitext(os.path.basename(inputFile))[0]
  noAggregation = True
  timePeriods = 'years months weeks days '\
                'hours minutes seconds milliseconds microseconds'
  for k in timePeriods.split():
    if a[k] > 0:
      noAggregation = False
      outputFile += '_%s_%d' % (k, a[k])

  if noAggregation:
    return inputFile
  outputFile += '.csv'
  outputFile = os.path.join(outputDir, outputFile)

  return outputFile
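# A hypothetical usage sketch for getFilename() above (not from the original
# source): it assumes findDataset() can resolve the relative path below. With
# a 90-minute period the result is a sibling file named
# 'agg_gym_hours_1_minutes_30.csv'; with an empty period dict the original
# input path is returned unchanged.
print getFilename({'hours': 1, 'minutes': 30}, 'extra/gym/gym.csv')
print getFilename({}, 'extra/gym/gym.csv')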
def _createLPFNetwork(addSP=True, addTP=False):
  """Create an 'old-style' network ala LPF and return it."""

  # ==========================================================================
  # Create the encoder and data source stuff we need to configure the sensor
  sensorParams = dict(verbosity=_VERBOSITY)
  encoder = _createEncoder()
  trainFile = findDataset("extra/gym/gym.csv")
  dataSource = FileRecordStream(streamID=trainFile)
  dataSource.setAutoRewind(True)

  # Create all the stuff we need to configure the CLARegion
  g_claConfig["spEnable"] = addSP
  g_claConfig["tpEnable"] = addTP
  claParams = _getCLAParams(encoder=encoder, config=g_claConfig)
  claParams["spSeed"] = g_claConfig["spSeed"]
  claParams["tpSeed"] = g_claConfig["tpSeed"]

  # ==========================================================================
  # Now create the network itself
  n = Network()
  n.addRegion("sensor", "py.RecordSensor", json.dumps(sensorParams))

  sensor = n.regions["sensor"].getSelf()
  sensor.encoder = encoder
  sensor.dataSource = dataSource

  n.addRegion("level1", "py.CLARegion", json.dumps(claParams))

  n.link("sensor", "level1", "UniformLink", "")
  n.link("sensor", "level1", "UniformLink", "",
         srcOutput="resetOut", destInput="resetIn")

  return n
def runHotgym():
  model = createModel()
  model.enableInference({'predictedField': 'consumption'})
  metricsManager = MetricsManager(_METRIC_SPECS, model.getFieldInfo(),
                                  model.getInferenceType())
  with open(findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    reader.next()
    reader.next()
    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["consumption"] = float(modelInput["consumption"])
      modelInput["timestamp"] = datetime.datetime.strptime(
          modelInput["timestamp"], "%m/%d/%y %H:%M")
      result = model.run(modelInput)
      result.metrics = metricsManager.update(result)
      isLast = i == _NUM_RECORDS
      if i % 100 == 0 or isLast:
        _LOGGER.info("After %i records, 1-step altMAPE=%f", i,
                     result.metrics["multiStepBestPredictions:multiStep:"
                                    "errorMetric='altMAPE':steps=1:window=1000:"
                                    "field=consumption"])
      if isLast:
        break
def runHotgymAnomaly():
  model = createModel()
  model.enableInference({'predictedField': 'consumption'})
  with open(findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    csvWriter = csv.writer(open(_OUTPUT_PATH, "wb"))
    csvWriter.writerow(["timestamp", "consumption", "anomaly_score"])
    headers = reader.next()
    reader.next()
    reader.next()
    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["consumption"] = float(modelInput["consumption"])
      modelInput["timestamp"] = datetime.datetime.strptime(
          modelInput["timestamp"], "%m/%d/%y %H:%M")
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      csvWriter.writerow([modelInput["timestamp"], modelInput["consumption"],
                          anomalyScore])
      if anomalyScore > _ANOMALY_THRESHOLD:
        _LOGGER.info("Anomaly detected at [%s]. Anomaly score: %f.",
                     result.rawInput["timestamp"], anomalyScore)

  print "Anomaly scores have been written to", _OUTPUT_PATH
def runHotgym():
  model = createModel()
  model.enableInference({"predictionSteps": [1, 5],
                         "predictedField": "consumption",
                         "numRecords": 4000})
  print findDataset(DATA_PATH)
  with open(findDataset(DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    print headers
    print reader.next()
    print reader.next()
    for record in reader:
      print record
      modelInput = dict(zip(headers, record))
      modelInput["consumption"] = float(modelInput["consumption"])
      modelInput["timestamp"] = datetime.datetime.strptime(
          modelInput["timestamp"], "%Y-%m-%d %H:%M:%S.%f")
      result = model.run(modelInput)
      print result
def _createOPFNetwork(addSP=True, addTP=False):
  """Create a 'new-style' network ala OPF and return it.

  If addSP is true, an SPRegion will be added named 'level1SP'.
  If addTP is true, a TPRegion will be added named 'level1TP'.
  """

  # ==========================================================================
  # Create the encoder and data source stuff we need to configure the sensor
  sensorParams = dict(verbosity=_VERBOSITY)
  encoder = _createEncoder()
  trainFile = findDataset("extra/gym/gym.csv")
  dataSource = FileRecordStream(streamID=trainFile)
  dataSource.setAutoRewind(True)

  # ==========================================================================
  # Now create the network itself
  n = Network()
  n.addRegion("sensor", "py.RecordSensor", json.dumps(sensorParams))

  sensor = n.regions["sensor"].getSelf()
  sensor.encoder = encoder
  sensor.dataSource = dataSource

  # ==========================================================================
  # Add the SP if requested
  if addSP:
    print "Adding SPRegion"
    g_spRegionConfig["inputWidth"] = encoder.getWidth()
    n.addRegion("level1SP", "py.SPRegion", json.dumps(g_spRegionConfig))

    n.link("sensor", "level1SP", "UniformLink", "")
    n.link("sensor", "level1SP", "UniformLink", "",
           srcOutput="resetOut", destInput="resetIn")
    n.link("level1SP", "sensor", "UniformLink", "",
           srcOutput="spatialTopDownOut", destInput="spatialTopDownIn")
    n.link("level1SP", "sensor", "UniformLink", "",
           srcOutput="temporalTopDownOut", destInput="temporalTopDownIn")

  # ==========================================================================
  if addTP and addSP:
    # Add the TP on top of SP if requested
    # The input width of the TP is set to the column count of the SP
    print "Adding TPRegion on top of SP"
    g_tpRegionConfig["inputWidth"] = g_spRegionConfig["columnCount"]
    n.addRegion("level1TP", "py.TPRegion", json.dumps(g_tpRegionConfig))

    n.link("level1SP", "level1TP", "UniformLink", "")
    n.link("level1TP", "level1SP", "UniformLink", "",
           srcOutput="topDownOut", destInput="topDownIn")
    n.link("sensor", "level1TP", "UniformLink", "",
           srcOutput="resetOut", destInput="resetIn")

  elif addTP:
    # Add a lone TPRegion if requested
    # The input width of the TP is set to the encoder width
    print "Adding TPRegion"
    g_tpRegionConfig["inputWidth"] = encoder.getWidth()
    n.addRegion("level1TP", "py.TPRegion", json.dumps(g_tpRegionConfig))

    n.link("sensor", "level1TP", "UniformLink", "")
    n.link("sensor", "level1TP", "UniformLink", "",
           srcOutput="resetOut", destInput="resetIn")

  return n
def _openStream(self, dataUrl, isBlocking, maxTimeout, bookmark,
                firstRecordIdx):
  """Open the underlying file stream.

  This only supports 'file://' prefixed paths.
  """
  self._recordStoreName = findDataset(dataUrl[len(FILE_PREF):])
  self._recordStore = FileRecordStream(streamID=self._recordStoreName,
                                       write=False,
                                       bookmark=bookmark,
                                       firstRecord=firstRecordIdx)
def runGeospatialAnomaly(dataPath, outputPath):
  model = createModel()

  with open(findDataset(dataPath)) as fin:
    reader = csv.reader(fin)
    csvWriter = csv.writer(open(outputPath, "wb"))
    csvWriter.writerow(["timestamp",
                        "longitude",
                        "latitude",
                        "speed",
                        "anomaly_score",
                        "new_sequence"])

    reader.next()
    reader.next()
    reader.next()

    lastTimestamp = None

    for _, record in enumerate(reader, start=1):
      timestamp = datetime.datetime.fromtimestamp(int(record[1]) / 1e3)
      longitude = float(record[2])
      latitude = float(record[3])
      speed = float(record[5])
      accuracy = float(record[7])

      if accuracy > ACCURACY_THRESHOLD:
        continue

      newSequence = False
      if lastTimestamp and (
          (timestamp - lastTimestamp).total_seconds() > INTERVAL_THRESHOLD):
        newSequence = True
      lastTimestamp = timestamp

      if newSequence:
        print "Starting new sequence..."
        model.resetSequenceStates()

      modelInput = {"vector": (longitude, latitude, speed)}
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']

      csvWriter.writerow([timestamp, longitude, latitude, speed,
                          anomalyScore, 1 if newSequence else 0])
      print "[{0}] - Anomaly score: {1}.".format(timestamp, anomalyScore)

  print "Anomaly scores have been written to {0}".format(outputPath)
def testFindDataset(self):
  # Test non-existing dataset (relative path)
  with self.assertRaises(Exception):
    findDataset('no_such_dataset.csv')

  # Test non-existing dataset (absolute path)
  with self.assertRaises(Exception):
    findDataset('/no_such_dataset.csv')

  # Test existing dataset (relative path)
  if not os.path.isdir('data'):
    os.makedirs('data')
  datasetPath = 'test_find_dataset.csv'
  filename = 'data/test_find_dataset.csv'  # This is the uncompressed name.
  fullPath = os.path.abspath(filename)
  if os.path.exists(fullPath):
    os.remove(fullPath)

  fullPathCompressed = fullPath + ".gz"
  if os.path.exists(fullPathCompressed):
    os.remove(fullPathCompressed)

  # Create the "dataset"
  open(filename, 'w').write('123')

  path = findDataset(datasetPath)
  self.assertEqual(path, fullPath)
  self.assertTrue(os.path.exists(path))

  # This should do nothing, since the file is already uncompressed
  path = uncompressAndCopyDataset(path)
  self.assertEqual(path, fullPath)

  # Test existing dataset (absolute path)
  self.assertEqual(findDataset(fullPath), fullPath)

  # Test existing dataset (compressed path)

  # Create the compressed file
  import gzip
  f = gzip.GzipFile(fullPathCompressed, 'w')
  f.write("1,2,3\n")
  f.close()
  self.assertTrue(os.path.isfile(fullPathCompressed))

  # Remove the original file
  os.remove(fullPath)

  self.assertEqual(findDataset(datasetPath), fullPathCompressed)

  # This should put the uncompressed file in the same directory
  path = uncompressAndCopyDataset(fullPathCompressed)
  self.assertEqual(path, fullPath)
  self.assertTrue(os.path.isfile(path))

  os.remove(fullPath)
  os.remove(fullPathCompressed)
def runDemo():
  trainFile = findDataset(_INPUT_FILE_PATH)
  dataSource = FileRecordStream(streamID=trainFile)
  numRecords = dataSource.getDataRowCount()
  print "Creating network"
  network = createNetwork(dataSource)
  outputPath = os.path.join(os.path.dirname(__file__), _OUTPUT_FILE_NAME)
  with open(outputPath, "w") as outputFile:
    writer = csv.writer(outputFile)
    print "Running network"
    print "Writing output to: %s" % outputPath
    runNetwork(network, numRecords, writer)
  print "Hierarchy demo finished"
def test_GymAggregateWithOldData(self):
  filename = findDataset('extra/gym/gym.csv')

  input = []
  gymFields = None

  with FileRecordStream(filename) as f:
    gymFields = f.getFields()
    for i in range(10):
      input.append(f.getNextRecord())

  # Append the records from the beginning to the end of the dataset
  input.extend(input[0:3])

  for h in (1, 3):
    aggregationOptions = dict(
        fields=[('timestamp', lambda x: x[0]),
                ('attendeeCount', sum),
                ('consumption', sum)],
        hours=h)

    handle = tempfile.NamedTemporaryFile(prefix='test', suffix='.bin')
    outputFile = handle.name
    handle.close()

    dataInput = DataInputList(input, gymFields)
    dataOutput = DataOutputList(None)

    _aggregate(input=dataInput, options=aggregationOptions,
               timeFieldName='timestamp', output=dataOutput)
    dataOutput.close()

    outputRecords = dataOutput._store

    timeFieldIdx = [f[0] for f in gymFields].index('timestamp')

    diffs = []
    for i in range(1, len(outputRecords)):
      diffs.append(outputRecords[i][timeFieldIdx] -
                   outputRecords[i - 1][timeFieldIdx])
    positiveTimeFlow = map((lambda x: x < datetime.timedelta(seconds=0)),
                           diffs)

    # Make sure that old records are in the aggregated output and at the same
    # time make sure that they are in consecutive order after being inserted
    self.assertEquals(sum(positiveTimeFlow), 1)

  return
def trainAndClassify(trainingSetSize, model, data_path, results_path):
  """
  In this function we explicitly label specific portions of the data stream.
  Any later record that matches the pattern will get labeled the same way.
  """
  model.enableInference({'predictedField': 'y'})

  # Here we will get the classifier instance so we can add and query labels.
  classifierRegion = model._getAnomalyClassifier()
  classifierRegionPy = classifierRegion.getSelf()

  # We need to set this classification type. It is supposed to be the default
  # but is not for some reason.
  classifierRegionPy.classificationVectorType = 2

  with open(findDataset(data_path)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()

    csvWriter = csv.writer(open(results_path, "wb"))
    csvWriter.writerow(["x", "y", "trueLabel", "anomalyScore",
                        "predictedLabel"])

    for x, record in enumerate(reader):
      modelInput = dict(zip(headers, record))
      modelInput["y"] = float(modelInput["y"])
      trueLabel = modelInput["label"]
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      predictedLabel = result.inferences['anomalyLabel']
      if predictedLabel == "[]":
        predictedLabel = 'label0'
      else:
        predictedLabel = result.inferences['anomalyLabel'][2:-2]

      # Relabel the prediction for all records with indices within the
      # training set size range.
      if x < trainingSetSize:
        for label in CLASS_RANGES:
          for class_range in CLASS_RANGES[label]:
            start = class_range['start']
            end = class_range['end']
            if start <= x <= end:
              predictedLabel = label
            if x == end + 2:
              print "Adding labeled anomalies for record", x
              classifierRegion.executeCommand(["addLabel", str(start),
                                               str(end + 1), label])

      csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore,
                          predictedLabel])

  print "Results have been written to %s" % results_path
def trainAndClassify(model, data_path, results_path):
  """
  In this function we explicitly label specific portions of the data stream.
  Any later record that matches the pattern will get labeled the same way.
  """
  model.enableInference({'predictedField': 'label'})
  model.enableLearning()

  # Here we will get the classifier instance so we can add and query labels.
  classifierRegion = model._getAnomalyClassifier()
  classifierRegionPy = classifierRegion.getSelf()

  # We need to set this classification type. It is supposed to be the default
  # but is not for some reason.
  classifierRegionPy.classificationVectorType = 2

  with open(findDataset(data_path)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()

    # Skip the first 2 data rows
    reader.next()
    reader.next()

    csvWriter = csv.writer(open(results_path, "wb"))
    csvWriter.writerow(["x", "y", "trueLabel", "anomalyScore",
                        "predictedLabel"])

    for x, record in enumerate(reader):
      modelInput = dict(zip(headers, record))
      modelInput["y"] = float(modelInput["y"])
      trueLabel = modelInput["label"]
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      predictedLabel = result.inferences['anomalyLabel'][2:-2]

      if x < SP_TRAINING_SET_SIZE:
        # Wait until the SP has seen the 3 classes at least once
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore,
                            'SP_TRAINING'])
      elif SP_TRAINING_SET_SIZE <= x < TM_TRAINING_SET_SIZE:
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore,
                            'TM_TRAINING'])
      elif TM_TRAINING_SET_SIZE <= x < CLASSIFIER_TRAINING_SET_SIZE:
        # Relabel predictions (i.e. train the classifier)
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore,
                            'CLASSIFIER_TRAINING'])
        classifierRegion.executeCommand(["addLabel", str(x), str(x + 1),
                                         trueLabel])
      elif x >= CLASSIFIER_TRAINING_SET_SIZE:
        # Predict
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore,
                            predictedLabel])

  print "Results have been written to %s" % results_path
def runHotgymAnomaly():
  model = createModel()
  model.enableInference({"predictedField": "consumption"})
  with open(findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    reader.next()
    reader.next()
    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["consumption"] = float(modelInput["consumption"])
      modelInput["timestamp"] = datetime.datetime.strptime(
          modelInput["timestamp"], "%m/%d/%y %H:%M")
      result = model.run(modelInput)
      anomalyScore = result.inferences["anomalyScore"]
      if anomalyScore > _ANOMALY_THRESHOLD:
        _LOGGER.info("Anomaly detected at [%s]. Anomaly score: %f.",
                     result.rawInput["timestamp"], anomalyScore)
def testSaveAndReload(self):
  """
  This function tests saving and loading. It will train a network for 500
  iterations, then save it and reload it as a second network instance. It will
  then run both networks for 100 iterations and ensure they return identical
  results.
  """
  print "Creating network..."

  netOPF = _createOPFNetwork()
  level1OPF = netOPF.regions['level1SP']

  # ==========================================================================
  print "Training network for 500 iterations"
  level1OPF.setParameter('learningMode', 1)
  level1OPF.setParameter('inferenceMode', 0)
  netOPF.run(500)
  level1OPF.setParameter('learningMode', 0)
  level1OPF.setParameter('inferenceMode', 1)

  # ==========================================================================
  # Save network and reload as a second instance. We need to reset the data
  # source for the unsaved network so that both instances start at the same
  # place
  print "Saving and reloading network"
  _, tmpNetworkFilename = _setupTempDirectory("trained.nta")
  netOPF.save(tmpNetworkFilename)
  netOPF2 = Network(tmpNetworkFilename)
  level1OPF2 = netOPF2.regions['level1SP']

  sensor = netOPF.regions['sensor'].getSelf()
  trainFile = findDataset("extra/gym/gym.csv")
  sensor.dataSource = FileRecordStream(streamID=trainFile)
  sensor.dataSource.setAutoRewind(True)

  # ==========================================================================
  print "Running inference on the two networks for 100 iterations"
  for _ in xrange(100):
    netOPF2.run(1)
    netOPF.run(1)
    l1outputOPF2 = level1OPF2.getOutputData("bottomUpOut")
    l1outputOPF = level1OPF.getOutputData("bottomUpOut")
    opfHash2 = l1outputOPF2.nonzero()[0].sum()
    opfHash = l1outputOPF.nonzero()[0].sum()

    self.assertEqual(opfHash2, opfHash)
def classifyAnomaliesAutomatically():
  """
  In this function we use the automatic labeling feature. Here we can set an
  anomaly threshold. Any record whose anomaly score goes above the threshold
  is automatically sent to the classifier. Any later record that matches the
  pattern will get labeled as "Auto Threshold Classification (auto)".
  """
  model = createModel()
  model.enableInference({'predictedField': 'sinx'})

  # Setup the classifier to automatically classify records with
  # anomaly score >= 0.9
  classifierRegion = model._getAnomalyClassifier()
  classifierRegion.setParameter('anomalyThreshold', 0.9)
  print "threshold for classifying anomalies is:", (
      classifierRegion.getParameter('anomalyThreshold'))

  with open(findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()

    csvWriter = csv.writer(open(_OUTPUT_PATH, "wb"))
    csvWriter.writerow(["x", "sinx", "anomaly_score", "anomalyLabel"])

    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["sinx"] = float(modelInput["sinx"])
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      anomalyLabel = result.inferences['anomalyLabel']

      # Convert the anomaly label into either 0 or 1
      if anomalyLabel == "[]":
        anomalyLabel = 0
      elif anomalyLabel == "['Auto Threshold Classification']":
        anomalyLabel = 1.0
      elif anomalyLabel == "['Auto Threshold Classification (auto)']":
        anomalyLabel = 1.0

      csvWriter.writerow([i, modelInput["sinx"], anomalyScore, anomalyLabel])

      if i > 500 and anomalyScore > _ANOMALY_THRESHOLD:
        print "Anomaly detected at row [%d]. Anomaly score: %f." % (
            i, anomalyScore)

  print "Anomaly scores have been written to", _OUTPUT_PATH
  print "The following labels were stored in the classifier:"
  labels = eval(classifierRegion.executeCommand(["getLabels"]))
  pprint.pprint(labels)
def runHotgym():
  model = createModel()
  model.enableInference({'predictedField': 'consumption'})
  metricsManager = MetricsManager(METRIC_SPECS, model.getFieldInfo(),
                                  model.getInferenceType())
  with open(findDataset(DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    reader.next()
    reader.next()
    for record in reader:
      modelInput = dict(zip(headers, record))
      modelInput["consumption"] = float(modelInput["consumption"])
      modelInput["timestamp"] = datetime.datetime.strptime(
          modelInput["timestamp"], "%Y-%m-%d %H:%M:%S.%f")
      result = model.run(modelInput)
      result.metrics = metricsManager.update(result)
      print result
def runHotgym():
  model = createModel()
  model.enableInference({'predictedField': 'Volume'})
  metricsManager = MetricsManager(_METRIC_SPECS, model.getFieldInfo(),
                                  model.getInferenceType())
  with open(findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["Volume"] = float(modelInput["Volume"])
      result = model.run(modelInput)
      result.metrics = metricsManager.update(result)
      isLast = i == _NUM_RECORDS
      if i % 100 == 0 or isLast:
        print result.metrics
      if isLast:
        break
def computeClassificationAccuracy(resultFile, trainingSetSize):
  numErrors = 0.0
  numRecords = 0.0
  with open(findDataset(resultFile)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    for i, record in enumerate(reader):
      if numRecords >= trainingSetSize:
        data = dict(zip(headers, record))
        if data['predictedLabel'] != data['trueLabel']:
          numErrors += 1.0
          print "=> Incorrectly predicted record at line %s." % i
          print "   True Label: %s. Predicted Label: %s" % (
              data['trueLabel'], data['predictedLabel'])
      numRecords += 1.0

  # Classification accuracy
  return (1 - numErrors / numRecords) * 100
def train():
  model.enableInference({'predictedField': 'hostname'})
  with open(findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    # Skip header lines
    reader.next()
    reader.next()
    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      model.run(modelInput)
      if i % 500 == 0:
        print("ran " + str(i) + " steps")
  model.save(os.path.join(_OUTPUT_PATH, "checkpoint"))
def computeClassificationAccuracy(resultFile):
  numErrors = 0.0
  numTestRecords = 0.0
  with open(findDataset(resultFile)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    for i, record in enumerate(reader):
      if i >= CLASSIFIER_TRAINING_SET_SIZE:
        data = dict(zip(headers, record))
        if data['predictedLabel'] != data['trueLabel']:
          numErrors += 1.0
          print "=> Incorrectly predicted record at line %s." % i
          print "   True Label: %s. Predicted Label: %s" % (
              data['trueLabel'], data['predictedLabel'])
        numTestRecords += 1.0

  # Classification accuracy
  return 100 * (1 - numErrors / numTestRecords)
def test_GymAggregate(self):
  filename = findDataset('extra/gym/gym.csv')

  input = []
  gymFields = None

  with FileRecordStream(filename) as f:
    gymFields = f.getFields()
    for i in range(10):
      input.append(f.getNextRecord())

  for h in (1, 3):
    aggregationOptions = dict(
        fields=[('timestamp', lambda x: x[0]),
                ('attendeeCount', sum),
                ('consumption', sum)],
        hours=h)

    handle = tempfile.NamedTemporaryFile(prefix='test', suffix='.bin')
    outputFile = handle.name
    handle.close()

    dataInput = DataInputList(input, gymFields)
    dataOutput = DataOutputMyFile(FileRecordStream(outputFile,
                                                   write=True,
                                                   fields=gymFields))

    _aggregate(input=dataInput, options=aggregationOptions,
               timeFieldName='timestamp', output=dataOutput)

    dataOutput.close()

    for r in FileRecordStream(outputFile):
      print r
    print '-' * 30

  return
def trainAndClassify(trainingSetSize, model, data_path, results_path):
  """
  In this function we explicitly label specific portions of the data stream.
  Any later record that matches the pattern will get labeled the same way.
  """
  model.enableInference({'predictedField': 'y'})

  # Here we will get the classifier instance so we can add and query labels.
  classifierRegion = model._getAnomalyClassifier()
  classifierRegionPy = classifierRegion.getSelf()

  # We need to set this classification type. It is supposed to be the default
  # but is not for some reason.
  classifierRegionPy.classificationVectorType = 2

  with open(findDataset(data_path)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()

    csvWriter = csv.writer(open(results_path, "wb"))
    csvWriter.writerow(["x", "y", "trueLabel", "anomalyScore",
                        "predictedLabel"])

    for x, record in enumerate(reader):
      modelInput = dict(zip(headers, record))
      modelInput["y"] = float(modelInput["y"])
      trueLabel = modelInput["label"]
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      predictedLabel = result.inferences['anomalyLabel'][2:-2]

      if x < 1000:
        # Wait until the SP has seen the 3 classes at least once
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore,
                            'NOT_READY'])
      elif 1000 <= x < trainingSetSize:
        # Relabel predictions (i.e. train the KNN classifier)
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore,
                            'TRAINING'])
        classifierRegion.executeCommand(["addLabel", str(x), str(x + 1),
                                         trueLabel])
      elif x >= trainingSetSize:
        # Predict
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore,
                            predictedLabel])

  print "Results have been written to %s" % results_path
def computeClassificationAccuracy(result_file):
  false_positive = 0
  false_negative = 0

  with open(findDataset(result_file)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    for i, record in enumerate(reader):
      data = dict(zip(headers, record))
      if data['predictedLabel'] == "label1" and data['trueLabel'] != "label1":
        false_positive += 1
        #print "False positive: %s, %s, %s" % (i, data['anomaly'],
        #                                      data['anomalyLabel'])
      if data['predictedLabel'] == "label0" and data['trueLabel'] != "label0":
        false_negative += 1
        #print "False negative: %s, %s, %s" % (i, data['anomaly'],
        #                                      data['anomalyLabel'])

  print ""
  print "== Classification accuracy for %s ==" % result_file
  print "* False positive: %s" % false_positive
  print "* False negative: %s" % false_negative
  print ""
def runHotgym():
  models = []
  previousPredictions = []
  bestPrediction = 0.0
  lstPrediction = 0.0

  # Setup all the models. Here each model has a different SP seed.
  for i in range(14):
    models.append(createModel(1956 + i))
  for m in models:
    setupModel(m)
    previousPredictions.append(0.0)

  # The best least squares predictor. Initialize with 1.0/len(models)
  bestFit = numpy.ones(len(models)) / len(models)

  print "Running ensemble with", len(models), "models. This could take a while!"

  # Matrix to hold the last month's worth of predictions and actuals
  lstNumRows = 2000
  a = numpy.zeros((lstNumRows, len(models)))
  b = numpy.zeros(lstNumRows)

  with open("output.csv", "wb") as outputFile:
    csvWriter = csv.writer(outputFile)
    csvWriter.writerow(["timestamp", "consumption", "predictions"])

    with open(findDataset(_DATA_PATH)) as fin:
      reader = csv.reader(fin)
      headers = reader.next()
      reader.next()
      reader.next()
      for i, record in enumerate(reader, start=1):

        # Prepare input dict for feeding each model
        modelInput = dict(zip(headers, record))
        modelInput["consumption"] = float(modelInput["consumption"])
        modelInput["timestamp"] = datetime.datetime.strptime(
            modelInput["timestamp"], "%m/%d/%y %H:%M")

        # Run each model and get each prediction and running sum
        results = []
        predictions = []
        sum = 0
        lstSum = 0
        for k, m in enumerate(models):
          result = m.run(modelInput)
          prediction = result.inferences["multiStepBestPredictions"][1]
          results.append(result)
          predictions.append(prediction)
          sum += prediction
          lstSum += bestFit[k] * prediction

        # Write results to the output CSV file
        if i > 1:
          row = [modelInput["timestamp"], modelInput["consumption"]]
          row.extend(previousPredictions)
          row.append(bestPrediction)
          row.append(lstPrediction)
          csvWriter.writerow(row)

        # Keep a rolling store of the last lstNumRows of predictions and
        # actuals
        a[i % lstNumRows] = previousPredictions
        b[i % lstNumRows] = modelInput["consumption"]

        # Redo the least squares estimate on the last lstNumRows every week
        if (i > 300 + lstNumRows) and (i % (24 * 7) == 0):
          print "Iteration: %d, doing least squares fit using " \
                "the last %d predictions!" % (i, lstNumRows)
          x = numpy.linalg.lstsq(a, b)
          bestFit = x[0]
          # Print the weights and the average residual squared error
          print bestFit, x[1][0] / lstNumRows

        # Compute best prediction (to be used next timestamp)
        # Save current predictions for later output. This shifts the
        # predictions so that they are lined up with the timestamps they are
        # actually predicting.
        previousPredictions = copy.deepcopy(predictions)
        bestPrediction = sum / len(models)
        lstPrediction = lstSum

        if i % 200 == 0:
          print "iteration:", i
        if i == _NUM_RECORDS:
          break
def testDeltaFilter(self):
  """
  data looks like:           should generate deltas
    "t"         "s"          "dt"      "ds"

    t           10           X
    t+1s        20           1s        10
    t+1d        50           86399     30

    r t+1d+1s   60           X
    r+1d+3s     65           2s        5
  """
  r = RecordSensor()
  filename = findDataset("extra/qa/delta.csv")
  datasource = FileRecordStream(filename)
  r.dataSource = datasource
  n = 50
  encoder = MultiEncoder({'blah': dict(fieldname="s",
                                       type='ScalarEncoder',
                                       n=n,
                                       w=11,
                                       minval=0,
                                       maxval=100)})
  r.encoder = encoder

  # Test #1 -- no deltas
  # Make sure we get a reset when the gym changes
  resetOut = numpy.zeros((1,), dtype='float')
  sequenceIdOut = numpy.zeros((1,), dtype='float')
  dataOut = numpy.zeros((n,), dtype='float')
  sourceOut = numpy.zeros((1,), dtype='float')
  categoryOut = numpy.zeros((1,), dtype='float')

  outputs = dict(resetOut=resetOut,
                 sourceOut=sourceOut,
                 sequenceIdOut=sequenceIdOut,
                 dataOut=dataOut,
                 categoryOut=categoryOut)
  inputs = dict()
  r.verbosity = 0

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24,
                                     hour=16, minute=8, second=0))
  self.assertEqual(lr['s'], 10)
  self.assertEqual(lr['_reset'], 1)
  self.assertTrue('dt' not in lr)
  self.assertTrue('ds' not in lr)

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24,
                                     hour=16, minute=8, second=1))
  self.assertEqual(lr['s'], 20)
  self.assertEqual(lr['_reset'], 0)

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25,
                                     hour=16, minute=8, second=0))
  self.assertEqual(lr['s'], 50)
  self.assertEqual(lr['_reset'], 0)

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25,
                                     hour=16, minute=8, second=1))
  self.assertEqual(lr['s'], 60)
  self.assertEqual(lr['_reset'], 1)

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25,
                                     hour=16, minute=8, second=3))
  self.assertEqual(lr['s'], 65)
  self.assertEqual(lr['_reset'], 0)

  # Add filters
  r.preEncodingFilters = [DeltaFilter("s", "ds"), DeltaFilter("t", "dt")]
  r.rewind()

  # skip first record, which has a reset
  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24,
                                     hour=16, minute=8, second=1))
  self.assertEqual(lr['s'], 20)
  self.assertEqual(lr['_reset'], 1)  # this record should have a reset since
                                     # it is first of a sequence
  self.assertEqual(lr['dt'], 1)
  self.assertEqual(lr['ds'], 10)

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25,
                                     hour=16, minute=8, second=0))
  self.assertEqual(lr['s'], 50)
  self.assertEqual(lr['_reset'], 0)
  self.assertEqual(lr['dt'], 3600 * 24 - 1)
  self.assertEqual(lr['ds'], 30)

  # next reset record is skipped
  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25,
                                     hour=16, minute=8, second=3))
  self.assertEqual(lr['s'], 65)
  self.assertEqual(lr['_reset'], 1)
  self.assertEqual(lr['dt'], 2)
  self.assertEqual(lr['ds'], 5)
def generateStats(filename, maxSamples=None):
  """
  Collect statistics for each of the fields in the user input data file and
  return a stats dict object.

  Parameters:
  ------------------------------------------------------------------------------
  filename:       The path and name of the data file.
  maxSamples:     Upper bound on the number of rows to be processed
  retval:         A dictionary of dictionaries. The top level keys are the
                  field names and the corresponding values are the statistics
                  collected for the individual file.
                  Example:
                  {
                    'consumption': {'min': 0, 'max': 90, 'mean': 50, ...},
                    'gym': {'numDistinctCategories': 10, ...},
                    ...
                  }
  """
  # Mapping from field type to stats collector object
  statsCollectorMapping = {'float': FloatStatsCollector,
                           'int': IntStatsCollector,
                           'string': StringStatsCollector,
                           'datetime': DateTimeStatsCollector,
                           'bool': BoolStatsCollector,
                           }

  filename = findDataset(filename)
  print "*" * 40
  print "Collecting statistics for file: '%s'" % (filename,)
  dataFile = FileRecordStream(filename)

  # Initialize collector objects
  # statsCollectors list holds statsCollector objects for each field
  statsCollectors = []
  for fieldName, fieldType, fieldSpecial in dataFile.getFields():
    # Find the corresponding stats collector for each field based on field
    # type and initialize an instance
    statsCollector = \
        statsCollectorMapping[fieldType](fieldName, fieldType, fieldSpecial)
    statsCollectors.append(statsCollector)

  # Now collect the stats
  if maxSamples is None:
    maxSamples = 500000
  for i in xrange(maxSamples):
    record = dataFile.getNextRecord()
    if record is None:
      break
    for i, value in enumerate(record):
      statsCollectors[i].addValue(value)

  # stats dict holds the statistics for each field
  stats = {}
  for statsCollector in statsCollectors:
    statsCollector.getStats(stats)

  # We don't want to include the reset field in permutations
  # TODO: handle the reset field in a clean way
  if dataFile.getResetFieldIdx() is not None:
    resetFieldName, _, _ = dataFile.getFields()[dataFile.reset]
    stats.pop(resetFieldName)

  if VERBOSITY > 0:
    pprint.pprint(stats)

  return stats
def generateDataset(aggregationInfo, inputFilename, outputFilename=None):
  """Generate a dataset of aggregated values

  Parameters:
  ----------------------------------------------------------------------------
  aggregationInfo: a dictionary that contains the following entries
    - fields: a list of pairs. Each pair is a field name and an
      aggregation function (e.g. sum). The function will be used to aggregate
      multiple values during the aggregation period.
    - aggregation period: 0 or more of unit=value fields; allowed units are:
          [years months] |
          [weeks days hours minutes seconds milliseconds microseconds]
      NOTE: years and months are mutually-exclusive with the other units.
      See getEndTime() and _aggregate() for more details.
      Example1: years=1, months=6
      Example2: hours=1, minutes=30
      If none of the period fields are specified, or if all that are specified
      have values of 0, then aggregation will be suppressed and the given
      inputFile parameter value will be returned.

  inputFilename: filename (or relative path from NTA_DATA_PATH) of the
      input dataset

  outputFilename: name for the output file. If not given, a name will be
      generated based on the input filename and the aggregation params

  retval: Name of the generated output file. This will be the same as the
      input file name if no aggregation needed to be performed

  If the input file contained a time field, sequence id field or reset field
  that were not specified in aggregationInfo fields, those fields will be
  added automatically with the following rules:

  1. The order will be R, S, T, rest of the fields
  2. The aggregation function for all will be to pick the first:
     lambda x: x[0]

  Returns: the path of the aggregated data file if aggregation was performed
      (in the same directory as the given input file); if aggregation did not
      need to be performed, then the given inputFile argument value is
      returned.
  """

  # Create the input stream
  inputFullPath = findDataset(inputFilename)
  inputObj = FileRecordStream(inputFullPath)

  # Instantiate the aggregator
  aggregator = Aggregator(aggregationInfo=aggregationInfo,
                          inputFields=inputObj.getFields())

  # Is it a null aggregation? If so, just return the input file unmodified
  if aggregator.isNullAggregation():
    return inputFullPath

  # ------------------------------------------------------------------------
  # If we were not given an output filename, create one based on the
  # aggregation settings
  if outputFilename is None:
    outputFilename = 'agg_%s' % \
        os.path.splitext(os.path.basename(inputFullPath))[0]
    timePeriods = 'years months weeks days '\
                  'hours minutes seconds milliseconds microseconds'
    for k in timePeriods.split():
      if aggregationInfo.get(k, 0) > 0:
        outputFilename += '_%s_%d' % (k, aggregationInfo[k])

    outputFilename += '.csv'
    outputFilename = os.path.join(os.path.dirname(inputFullPath),
                                  outputFilename)

  # ------------------------------------------------------------------------
  # If some other process already started creating this file, simply
  # wait for it to finish and return without doing anything
  lockFilePath = outputFilename + '.please_wait'
  if os.path.isfile(outputFilename) or \
     os.path.isfile(lockFilePath):
    while os.path.isfile(lockFilePath):
      print 'Waiting for %s to be fully written by another process' % \
            lockFilePath
      time.sleep(1)
    return outputFilename

  # Create the lock file
  lockFD = open(lockFilePath, 'w')

  # -------------------------------------------------------------------------
  # Create the output stream
  outputObj = FileRecordStream(streamID=outputFilename, write=True,
                               fields=inputObj.getFields())

  # -------------------------------------------------------------------------
  # Write all aggregated records to the output
  while True:
    inRecord = inputObj.getNextRecord()

    (aggRecord, aggBookmark) = aggregator.next(inRecord, None)

    if aggRecord is None and inRecord is None:
      break

    if aggRecord is not None:
      outputObj.appendRecord(aggRecord)

  return outputFilename
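# A hypothetical call sketch for generateDataset() above (not from the
# original source), assuming the gym dataset resolves via findDataset(). It
# aggregates attendeeCount and consumption into 4-hour buckets, keeps the
# first timestamp of each bucket, and returns the path of the aggregated CSV
# (or the input path if the period is all zeros).
aggInfo = dict(fields=[('attendeeCount', sum),
                       ('consumption', sum),
                       ('timestamp', lambda x: x[0])],
               hours=4)
aggregatedPath = generateDataset(aggInfo, 'extra/gym/gym.csv')
print aggregatedPath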
def runHotgym():
  model = createModel()
  model.enableInference({'predictedField': 'consumption'})
  metricsManager = MetricsManager(_METRIC_SPECS, model.getFieldInfo(),
                                  model.getInferenceType())

  # The shifter will align prediction and actual values.
  shifter = InferenceShifter()

  # Keep the last WINDOW predicted and actual values for plotting.
  actHistory = deque([0.0] * WINDOW, maxlen=60)
  predHistory = deque([0.0] * WINDOW, maxlen=60)

  # Initialize the plot lines that we will update with each new record.
  actline, = plt.plot(range(WINDOW), actHistory)
  predline, = plt.plot(range(WINDOW), predHistory)
  # Set the y-axis range.
  actline.axes.set_ylim(0, 100)
  predline.axes.set_ylim(0, 100)

  with open(findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    reader.next()
    reader.next()
    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["consumption"] = float(modelInput["consumption"])
      modelInput["timestamp"] = datetime.datetime.strptime(
          modelInput["timestamp"], "%m/%d/%y %H:%M")
      result = model.run(modelInput)
      shifted_result = shifter.shift(result)

      # Update the trailing predicted and actual value deques.
      inference = shifted_result.inferences['multiStepBestPredictions'][1]
      if inference is not None:
        actHistory.append(shifted_result.rawInput['consumption'])
        predHistory.append(inference)

      # Redraw the chart with the new data.
      actline.set_ydata(actHistory)  # update the data
      predline.set_ydata(predHistory)  # update the data
      plt.draw()
      plt.legend(('actual', 'predicted'))

      # Make sure we wait a total of SECONDS_PER_STEP seconds per iteration.
      try:
        plt.pause(SECONDS_PER_STEP)
      except:
        pass

      result.metrics = metricsManager.update(result)
      isLast = i == _NUM_RECORDS
      if i % 100 == 0 or isLast:
        _LOGGER.info("After %i records, 1-step altMAPE=%f", i,
                     result.metrics["multiStepBestPredictions:multiStep:"
                                    "errorMetric='altMAPE':steps=1:window=1000:"
                                    "field=consumption"])
      if isLast:
        break
def findAllDatasets(datasets):
  """Find all datasets in a dataset dictionary"""
  d = dict()
  for key in datasets:
    d[key] = findDataset(datasets[key])
  return d
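# A hypothetical usage sketch for findAllDatasets() above (not from the
# original source); the keys and relative paths are illustrative and must be
# resolvable by findDataset(). The result maps each key to an absolute path.
paths = findAllDatasets({'gym': 'extra/gym/gym.csv',
                         'delta': 'extra/qa/delta.csv'})
print paths['gym']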
def classifyAnomaliesManually():
  """
  In this function we explicitly label specific portions of the data stream
  that we happen to know are anomalous. Any later record that matches the
  pattern will get labeled as "myAnomaly".
  """
  model = createModel()
  model.enableInference({'predictedField': 'sinx'})

  # Here we will get the classifier instance so we can add and query labels.
  classifierRegion = model._getAnomalyClassifier()
  classifierRegionPy = classifierRegion.getSelf()

  # We need to set this classification type. It is supposed to be the default
  # but is not for some reason.
  classifierRegionPy.classificationVectorType = 2

  with open(findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()

    csvWriter = csv.writer(open(_OUTPUT_PATH, "wb"))
    csvWriter.writerow(["x", "sinx", "anomaly_score", "anomalyLabel"])

    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["sinx"] = float(modelInput["sinx"])
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      anomalyLabel = result.inferences['anomalyLabel']

      # Convert the anomaly label into either 0 or 1
      if anomalyLabel == "[]":
        anomalyLabel = 0
      elif anomalyLabel == "['myAnomaly']":
        anomalyLabel = 1.0
        print "Anomaly detected at record", i

      # Manually tell the classifier to learn the first few artificial
      # anomalies. From there it should catch many of the following
      # anomalies, even though the anomaly score might be low.
      if i == 2505:
        print "Adding labeled anomalies for record", i
        classifierRegion.executeCommand(["addLabel", "2498", "2503",
                                         "myAnomaly"])
        anomalyLabel = 1.0

      if i == 2605:
        print "Adding labeled anomalies for record", i
        classifierRegion.executeCommand(["addLabel", "2598", "2603",
                                         "myAnomaly"])
        anomalyLabel = 1.0

      if i == 2705:
        print "Adding labeled anomalies for record", i
        classifierRegion.executeCommand(["addLabel", "2698", "2703",
                                         "myAnomaly"])
        anomalyLabel = 1.0

      csvWriter.writerow([i, modelInput["sinx"], anomalyScore, anomalyLabel])

  print "Anomaly scores have been written to", _OUTPUT_PATH
  print "The following labels were stored in the classifier:"
  labels = eval(classifierRegion.executeCommand(["getLabels"]))
  pprint.pprint(labels)
"bottomUpOut").nonzero()[0] # Calculate the anomaly score using the active columns # and previous predicted columns anomalyScore = computeRawAnomalyScore(activeColumns, prevPredictedColumns) # Write out the anomaly score along with the record number and consumption # value. consumption = sensorRegion.getOutputData("sourceOut")[0] writer.writerow((i, consumption, anomalyScore)) # Store the predicted columns for the next timestep predictedColumns = temporalPoolerRegion.getOutputData( "topDownOut").nonzero()[0] prevPredictedColumns = copy.deepcopy(predictedColumns) i += 1 if __name__ == "__main__": trainFile = findDataset(_DATA_PATH) dataSource = FileRecordStream(streamID=trainFile) network = createNetwork(dataSource) outputPath = os.path.join(os.path.dirname(__file__), _OUTPUT_PATH) with open(outputPath, "w") as outputFile: writer = csv.writer(outputFile) print "Writing output to %s" % outputPath runNetwork(network, writer)
def generateStats(filename, statsInfo, maxSamples=None, filters=[],
                  cache=True):
  """Generate requested statistics for a dataset and cache to a file.
  If filename is None, then don't cache to a file"""

  # Sanity checking
  if not isinstance(statsInfo, dict):
    raise RuntimeError("statsInfo must be a dict -- "
                       "found '%s' instead" % type(statsInfo))

  filename = findDataset(filename)

  if cache:
    statsFilename = getStatsFilename(filename, statsInfo, filters)
    # Use cached stats if found AND if it has the right data
    if os.path.exists(statsFilename):
      try:
        r = pickle.load(open(statsFilename, "rb"))
      except:
        # Ok to ignore errors -- we will just re-generate the file
        print "Warning: unable to load stats for %s -- " \
              "will regenerate" % filename
        r = dict()
      requestedKeys = set([s for s in statsInfo])
      availableKeys = set(r.keys())
      unavailableKeys = requestedKeys.difference(availableKeys)
      if len(unavailableKeys) == 0:
        return r
      else:
        print "generateStats: re-generating stats file %s because " \
              "keys %s are not available" % (filename, str(unavailableKeys))
        os.remove(filename)

  print "Generating statistics for file '%s' with filters '%s'" % (
      filename, filters)
  sensor = RecordSensor()
  sensor.dataSource = FileRecordStream(filename)
  sensor.preEncodingFilters = filters

  # Convert collector description to collector object
  stats = []
  for field in statsInfo:
    # field = key from statsInfo
    if statsInfo[field] == "number":
      # This wants a field name e.g. consumption and the field type as the
      # value
      statsInfo[field] = NumberStatsCollector()
    elif statsInfo[field] == "category":
      statsInfo[field] = CategoryStatsCollector()
    else:
      raise RuntimeError("Unknown stats type '%s' for field '%s'" %
                         (statsInfo[field], field))

  # Now collect the stats
  if maxSamples is None:
    maxSamples = 500000
  for i in xrange(maxSamples):
    try:
      record = sensor.getNextRecord()
    except StopIteration:
      break
    for (name, collector) in statsInfo.items():
      collector.add(record[name])

  del sensor

  # Assemble the results and return
  r = dict()
  for (field, collector) in statsInfo.items():
    stats = collector.getStats()
    if field not in r:
      r[field] = stats
    else:
      r[field].update(stats)

  if cache:
    f = open(statsFilename, "wb")
    pickle.dump(r, f)
    f.close()
    # caller may need to know name of cached file
    r["_filename"] = statsFilename

  return r
def runGeospatialAnomaly(dataPath, outputPath,
                         scale=False,
                         autoSequence=True,
                         useTimeEncoders=False,
                         verbose=False):
  model = createModel(useTimeEncoders, scale, verbose)

  with open(findDataset(dataPath)) as fin:
    reader = csv.reader(fin)
    csvWriter = csv.writer(open(outputPath, "wb"))
    csvWriter.writerow(["timestamp",
                        "longitude",
                        "latitude",
                        "speed",
                        "anomaly_score",
                        "new_sequence"])

    reader.next()
    reader.next()
    reader.next()

    lastTimestamp = None
    lastTrackName = None

    for _, record in enumerate(reader, start=1):
      trackName = record[0]
      timestamp = datetime.datetime.fromtimestamp(int(record[1]) / 1e3)
      longitude = float(record[2])
      latitude = float(record[3])
      speed = float(record[5])
      accuracy = float(record[7])

      if accuracy > ACCURACY_THRESHOLD:
        continue

      newSequence = False
      # Handle the automatic sequence creation
      if autoSequence:
        if lastTimestamp and (
            (timestamp - lastTimestamp).total_seconds() > INTERVAL_THRESHOLD):
          newSequence = True
      # Manual sequence resets depend on the track name
      else:
        if trackName != lastTrackName:
          newSequence = True

      lastTimestamp = timestamp
      lastTrackName = trackName

      if newSequence:
        if verbose:
          print "Starting new sequence..."
        model.resetSequenceStates()

      modelInput = {"vector": (longitude, latitude, speed)}
      if useTimeEncoders:
        modelInput["timestamp"] = timestamp

      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']

      csvWriter.writerow([timestamp, longitude, latitude, speed,
                          anomalyScore, 1 if newSequence else 0])
      if verbose:
        print "[{0}] - Anomaly score: {1}.".format(timestamp, anomalyScore)

  print "Anomaly scores have been written to {0}".format(outputPath)