def testCaseSingleSpike(self):
        """
        Feed a long run of zero scores and then one score of 1. The value
        returned for that final record is expected to be (almost) 1.0.
        """
        # 100 is presumably the learning period -- TODO confirm against the
        # AnomalyLikelihood constructor.
        likelihood = AnomalyLikelihood(100)

        # Quiet history: 1000 records with a raw score of zero.
        for quietScore in [0] * 1000:
            likelihood.compute(quietScore)

        # The single spike.
        spikeLikelihood = likelihood.compute(1)
        self.assertAlmostEqual(spikeLikelihood, 1.0, places=3)
    def testCaseUnusuallyHighSpikeFrequency(self):
        """
        Test B: one anomaly spike every 20 records. Then we suddenly get a bunch
        in a row. The likelihood of those spikes should be high.
        """
        an = AnomalyLikelihood()

        data = self._addSampleData(spikePeriod=20, numSamples=3000)
        anom = [an.compute(x) for x in data]

        # If we continue to see the same distribution, we should get reasonable
        # likelihoods. assertLess is used instead of assertTrue(a < b) so that
        # a failure reports both the observed maximum and the threshold.
        self.assertLess(max(anom[-100:]), threshold)

        # Make 20 spikes in a row.
        anom = [an.compute(1.0) for _ in range(20)]
        # Check for anomaly detected.
        self.assertGreater(max(anom), threshold)
    def testCaseIncreasedAnomalyScore(self):
        """
        Test F: small anomaly score every 20 records, but then a large one when you
        would expect a small one. This should be anomalous.
        """
        an = AnomalyLikelihood()

        # Warm-up: only the effect on `an` matters here, the returned values
        # are not inspected, so no list is built.
        data = self._addSampleData(spikePeriod=20,
                                   spikeValue=0.4,
                                   numSamples=3000)
        for x in data:
            an.compute(x)

        # Now feed in a larger magnitude distribution.
        data = self._addSampleData(spikePeriod=20,
                                   spikeValue=1.0,
                                   numSamples=100)
        anom = [an.compute(x) for x in data]

        # assertGreater reports both operands on failure, unlike
        # assertTrue(a > b).
        self.assertGreater(max(anom), threshold)
    def testCaseContinuousBunchesOfSpikes(self):
        """
        Test D: bunches of anomalies every 40 records that continue. This should
        not be anomalous.
        """
        an = AnomalyLikelihood()

        # Generate initial data: 30 cycles, each made of 30 samples with
        # spikePeriod=0 followed by 10 samples with spikePeriod=3.
        data = []
        for _ in range(30):
            data = self._addSampleData(data, spikePeriod=0, numSamples=30)
            data = self._addSampleData(data, spikePeriod=3, numSamples=10)

        # Warm-up on the first 1000 records; the returned values are not
        # inspected, so no list is built.
        for x in data[:1000]:
            an.compute(x)

        # Now feed in the same distribution
        data = self._addSampleData(spikePeriod=0, numSamples=30)
        data = self._addSampleData(data, spikePeriod=3, numSamples=10)
        anom = [an.compute(x) for x in data]

        # assertLess reports both operands on failure, unlike
        # assertTrue(a < b).
        self.assertLess(max(anom), threshold)
    def testCaseIncreasedSpikeFrequency(self):
        """
        Test E: recurring bunches of anomalies (30 samples with spikePeriod=0,
        then 10 samples with spikePeriod=3) whose bunches then become more
        frequent (spikePeriod=1). This should be anomalous.
        """
        an = AnomalyLikelihood(500)

        # Generate initial data
        data = []
        for _ in range(30):
            data = self._addSampleData(data, spikePeriod=0, numSamples=30)
            data = self._addSampleData(data, spikePeriod=3, numSamples=10)

        # Warm-up on the first 1000 records; the returned values are not
        # inspected, so no list is built.
        for x in data[:1000]:
            an.compute(x)

        # Now feed in a more frequent distribution
        data = self._addSampleData(spikePeriod=0, numSamples=30)
        data = self._addSampleData(data, spikePeriod=1, numSamples=10)
        anom = [an.compute(x) for x in data]

        # The likelihood should become anomalous but only near the end.
        # assertLess/assertGreater report both operands on failure, unlike
        # assertTrue on a comparison.
        self.assertLess(max(anom[0:30]), threshold)
        self.assertGreater(max(anom[30:]), threshold)
Example #6
0
def main(parameters=default_parameters, argv=None, verbose=True):
    """
    Run every record of the input CSV through encoders, a SpatialPooler and a
    TemporalMemory, collecting predictions (1 and 5 steps ahead) and anomaly
    scores, then print statistics and optionally plot the results.

    @param parameters nested dict; the sections read here are "enc", "sp",
           "tm", "anomaly" and "predictor".
    @param argv not used by this function.
    @param verbose when True, print the parameters before running and plot
           the predictions and anomaly scores with matplotlib afterwards.

    @return the negated RMS error of the 5-steps-ahead predictions.
    """
    if verbose:
        import pprint
        print("Parameters:")
        pprint.pprint(parameters, indent=4)
        print("")

    # Read the input file.  The first row holds the headers and the two rows
    # after it are skipped as well; everything else is a data record.
    with open(_INPUT_FILE_PATH, "r") as fin:
        reader = csv.reader(fin)
        for _ in range(3):
            next(reader)
        records = list(reader)

    # Make the Encoders.  These will convert input data into binary representations.
    dateEncoder = DateEncoder(timeOfDay=parameters["enc"]["time"]["timeOfDay"],
                              weekend=parameters["enc"]["time"]["weekend"])

    scalarEncoderParams = RDSE_Parameters()
    scalarEncoderParams.size = parameters["enc"]["value"]["size"]
    scalarEncoderParams.sparsity = parameters["enc"]["value"]["sparsity"]
    scalarEncoderParams.resolution = parameters["enc"]["value"]["resolution"]
    scalarEncoder = RDSE(scalarEncoderParams)
    encodingWidth = (dateEncoder.size + scalarEncoder.size)
    enc_info = Metrics([encodingWidth], 999999999)

    # Make the HTM.  SpatialPooler & TemporalMemory & associated tools.
    spParams = parameters["sp"]
    sp = SpatialPooler(inputDimensions=(encodingWidth, ),
                       columnDimensions=(spParams["columnCount"], ),
                       potentialPct=spParams["potentialPct"],
                       potentialRadius=encodingWidth,
                       globalInhibition=True,
                       localAreaDensity=spParams["localAreaDensity"],
                       synPermInactiveDec=spParams["synPermInactiveDec"],
                       synPermActiveInc=spParams["synPermActiveInc"],
                       synPermConnected=spParams["synPermConnected"],
                       boostStrength=spParams["boostStrength"],
                       wrapAround=True)
    sp_info = Metrics(sp.getColumnDimensions(), 999999999)

    tmParams = parameters["tm"]
    tm = TemporalMemory(
        columnDimensions=(spParams["columnCount"], ),
        cellsPerColumn=tmParams["cellsPerColumn"],
        activationThreshold=tmParams["activationThreshold"],
        initialPermanence=tmParams["initialPerm"],
        connectedPermanence=spParams["synPermConnected"],
        minThreshold=tmParams["minThreshold"],
        maxNewSynapseCount=tmParams["newSynapseCount"],
        permanenceIncrement=tmParams["permanenceInc"],
        permanenceDecrement=tmParams["permanenceDec"],
        predictedSegmentDecrement=0.0,
        maxSegmentsPerCell=tmParams["maxSegmentsPerCell"],
        maxSynapsesPerSegment=tmParams["maxSynapsesPerSegment"])
    tm_info = Metrics([tm.numberOfCells()], 999999999)

    anomaly_history = AnomalyLikelihood(parameters["anomaly"]["period"])

    predictor = Predictor(steps=[1, 5],
                          alpha=parameters["predictor"]['sdrc_alpha'])
    predictor_resolution = 1

    # Iterate through every datum in the dataset, record the inputs & outputs.
    inputs = []
    anomaly = []
    anomalyProb = []
    predictions = {1: [], 5: []}
    for count, record in enumerate(records):

        # Convert date string into Python date object.
        timestamp = datetime.datetime.strptime(record[0], "%m/%d/%y %H:%M")
        # Convert data value string into float.
        consumption = float(record[1])
        inputs.append(consumption)

        # Call the encoders to create bit representations for each value.  These are SDR objects.
        dateBits = dateEncoder.encode(timestamp)
        consumptionBits = scalarEncoder.encode(consumption)

        # Concatenate all these encodings into one large encoding for Spatial Pooling.
        encoding = SDR(encodingWidth).concatenate([consumptionBits, dateBits])
        enc_info.addData(encoding)

        # Create an SDR to represent active columns, This will be populated by the
        # compute method below. It must have the same dimensions as the Spatial Pooler.
        activeColumns = SDR(sp.getColumnDimensions())

        # Execute Spatial Pooling algorithm over input space.
        sp.compute(encoding, True, activeColumns)
        sp_info.addData(activeColumns)

        # Execute Temporal Memory algorithm over active mini-columns.
        tm.compute(activeColumns, learn=True)
        tm_info.addData(tm.getActiveCells().flatten())

        # Predict what will happen, and then train the predictor based on what just happened.
        pdf = predictor.infer(tm.getActiveCells())
        for n in (1, 5):
            if pdf[n]:
                predictions[n].append(np.argmax(pdf[n]) * predictor_resolution)
            else:
                # No prediction available for this step count yet.
                predictions[n].append(float('nan'))

        anomaly.append(tm.anomaly)
        anomalyProb.append(anomaly_history.compute(tm.anomaly))

        predictor.learn(count, tm.getActiveCells(),
                        int(consumption / predictor_resolution))

    # Print information & statistics about the state of the HTM.
    print("Encoded Input", enc_info)
    print("")
    print("Spatial Pooler Mini-Columns", sp_info)
    print(str(sp))
    print("")
    print("Temporal Memory Cells", tm_info)
    print(str(tm))
    print("")

    # Shift the predictions so that they are aligned with the input they predict.
    # Each list keeps its length: a NaN is pushed in front and the last
    # (now out-of-range) prediction is dropped, once per step.
    for n_steps, pred_list in predictions.items():
        for _ in range(n_steps):
            pred_list.insert(0, float('nan'))
            pred_list.pop()

    # Calculate the predictive accuracy, Root-Mean-Squared
    accuracy = {1: 0, 5: 0}
    accuracy_samples = {1: 0, 5: 0}

    for idx, inp in enumerate(inputs):
        for n in predictions:  # For each [N]umber of time steps ahead which was predicted.
            val = predictions[n][idx]
            if not math.isnan(val):
                accuracy[n] += (inp - val)**2
                accuracy_samples[n] += 1
    for n in sorted(predictions):
        # Raises ZeroDivisionError when no non-NaN prediction exists for n.
        accuracy[n] = (accuracy[n] / accuracy_samples[n])**.5
        print("Predictive Error (RMS)", n, "steps ahead:", accuracy[n])

    # Show info about the anomaly (mean & std)
    print("Anomaly Mean", np.mean(anomaly))
    print("Anomaly Std ", np.std(anomaly))

    # Plot the Predictions and Anomalies.
    if verbose:
        try:
            import matplotlib.pyplot as plt
        except Exception:
            # Plotting is best-effort: any import-time failure skips the plots.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            print(
                "WARNING: failed to import matplotlib, plots cannot be shown.")
            return -accuracy[5]

        timeAxis = np.arange(len(inputs))

        plt.subplot(2, 1, 1)
        plt.title("Predictions")
        plt.xlabel("Time")
        plt.ylabel("Power Consumption")
        plt.plot(
            timeAxis,
            inputs,
            'red',
            timeAxis,
            predictions[1],
            'blue',
            timeAxis,
            predictions[5],
            'green',
        )
        plt.legend(labels=('Input', '1 Step Prediction, Shifted 1 step',
                           '5 Step Prediction, Shifted 5 steps'))

        plt.subplot(2, 1, 2)
        plt.title("Anomaly Score")
        plt.xlabel("Time")
        plt.ylabel("Power Consumption")
        # Scale the inputs by their maximum so they share an axis with the
        # anomaly curves.
        normalizedInputs = np.array(inputs) / max(inputs)
        plt.plot(
            timeAxis,
            normalizedInputs,
            'black',
            timeAxis,
            anomaly,
            'blue',
            timeAxis,
            anomalyProb,
            'red',
        )
        plt.legend(labels=('Input', 'Instantaneous Anomaly',
                           'Anomaly Likelihood'))
        plt.show()

    return -accuracy[5]
Example #7
0
class HtmcoreDetector(AnomalyDetector):
    """
    This detector uses an HTM based anomaly detection technique: each record
    is encoded, passed through a SpatialPooler and a TemporalMemory, and the
    resulting anomaly score is optionally turned into an anomaly likelihood
    and combined with a simple value-range ("spatial") check.
    """
    def __init__(self, *args, **kwargs):
        """Forward all arguments to the base detector and set up defaults."""

        super(HtmcoreDetector, self).__init__(*args, **kwargs)

        ## API for controlling settings of htm.core HTM detector:

        # Set this to False if you want to get results based on raw scores
        # without using AnomalyLikelihood. This will give worse results, but
        # useful for checking the efficacy of AnomalyLikelihood. You will need
        # to re-optimize the thresholds when running with this setting.
        self.useLikelihood = True
        self.useSpatialAnomaly = True
        self.verbose = True
        # Add the "HTMCORE_OPTIMIZE" flag to the environment variables to use the optimization.
        # If present, it reads the parameters from ./params.json
        # If absent, it uses the global variable "default_parameters".
        self.use_optimization = 'HTMCORE_OPTIMIZE' in os.environ

        ## internal members
        # (listed here for easier understanding)
        # initialized in `initialize()`
        self.encTimestamp = None
        self.encValue = None
        self.sp = None
        self.tm = None
        # Same attribute name that `initialize()` assigns and `modelRun()`
        # reads.
        self.anomalyLikelihood = None
        # optional debug info
        self.enc_info = None
        self.sp_info = None
        self.tm_info = None
        # internal helper variables:
        self.inputs_ = []
        self.iteration_ = 0

    def getAdditionalHeaders(self):
        """Returns a list of strings."""
        return ["raw_score"]  #TODO optional: add "prediction"

    def handleRecord(self, inputData):
        """Returns a tuple (anomalyScore, rawScore).

        @param inputData is a dict {"timestamp" : Timestamp(), "value" : float}

        @return tuple (anomalyScore, <any other fields specified in `getAdditionalHeaders()`>, ...)
        """
        # Send it to Numenta detector and get back the results
        return self.modelRun(inputData["timestamp"], inputData["value"])

    def initialize(self):
        """
        Build the encoders, SpatialPooler, TemporalMemory and (optionally) the
        AnomalyLikelihood from either ./params.json or `default_parameters`.
        """
        # toggle parameters here
        if self.use_optimization:
            parameters = read_params('params.json')
        else:
            parameters = default_parameters

        # setup spatial anomaly
        if self.useSpatialAnomaly:
            # Keep track of value range for spatial anomaly detection
            self.minVal = None
            self.maxVal = None

        ## setup Enc, SP, TM, Likelihood
        # Make the Encoders.  These will convert input data into binary representations.
        self.encTimestamp = DateEncoder(
            timeOfDay=parameters["enc"]["time"]["timeOfDay"],
            weekend=parameters["enc"]["time"]["weekend"])

        scalarEncoderParams = RDSE_Parameters()
        scalarEncoderParams.size = parameters["enc"]["value"]["size"]
        scalarEncoderParams.sparsity = parameters["enc"]["value"]["sparsity"]
        scalarEncoderParams.resolution = parameters["enc"]["value"][
            "resolution"]

        self.encValue = RDSE(scalarEncoderParams)
        encodingWidth = (self.encTimestamp.size + self.encValue.size)
        self.enc_info = Metrics([encodingWidth], 999999999)

        # Make the HTM.  SpatialPooler & TemporalMemory & associated tools.
        # SpatialPooler
        spParams = parameters["sp"]
        self.sp = SpatialPooler(
            inputDimensions=(encodingWidth, ),
            columnDimensions=(spParams["columnCount"], ),
            potentialPct=spParams["potentialPct"],
            potentialRadius=encodingWidth,
            globalInhibition=True,
            localAreaDensity=spParams["localAreaDensity"],
            synPermInactiveDec=spParams["synPermInactiveDec"],
            synPermActiveInc=spParams["synPermActiveInc"],
            synPermConnected=spParams["synPermConnected"],
            boostStrength=spParams["boostStrength"],
            wrapAround=True,
            seed=0,
        )
        self.sp_info = Metrics(self.sp.getColumnDimensions(), 999999999)

        # TemporalMemory
        tmParams = parameters["tm"]
        self.tm = TemporalMemory(
            columnDimensions=(spParams["columnCount"], ),
            cellsPerColumn=tmParams["cellsPerColumn"],
            activationThreshold=tmParams["activationThreshold"],
            initialPermanence=tmParams["initialPerm"],
            connectedPermanence=spParams["synPermConnected"],
            minThreshold=tmParams["minThreshold"],
            maxNewSynapseCount=tmParams["newSynapseCount"],
            permanenceIncrement=tmParams["permanenceInc"],
            permanenceDecrement=tmParams["permanenceDec"],
            predictedSegmentDecrement=0.0,
            maxSegmentsPerCell=tmParams["maxSegmentsPerCell"],
            maxSynapsesPerSegment=tmParams["maxSynapsesPerSegment"],
            externalPredictiveInputs=self.encTimestamp.size,
            seed=0,
        )
        self.tm_info = Metrics([self.tm.numberOfCells()], 999999999)

        # setup likelihood, these settings are used in NAB
        if self.useLikelihood:
            learningPeriod = int(math.floor(self.probationaryPeriod / 2.0))
            self.anomalyLikelihood = AnomalyLikelihood(learningPeriod)
        # Predictor
        # self.predictor = Predictor( steps=[1, 5], alpha=parameters["predictor"]['sdrc_alpha'] )
        # predictor_resolution = 1

        # initialize pandaBaker
        if PANDA_VIS_BAKE_DATA:
            self.BuildPandaSystem(self.sp, self.tm,
                                  parameters["enc"]["value"]["size"],
                                  self.encTimestamp.size)

    def modelRun(self, ts, val):
        """
        Run a single pass through HTM model

        @params ts - Timestamp
        @params val - float input value

        @return tuple (anomalyScore, raw): anomalyScore is the maximum of the
                spatial anomaly and the temporal anomaly (likelihood when
                `useLikelihood` is set, raw score otherwise); raw is the
                TemporalMemory anomaly computed for `val` in this step
        """
        ## run data through our model pipeline: enc -> SP -> TM -> Anomaly
        self.inputs_.append(val)
        self.iteration_ += 1

        # 1. Encoding
        # Call the encoders to create bit representations for each value.  These are SDR objects.
        dateBits = self.encTimestamp.encode(ts)
        valueBits = self.encValue.encode(float(val))
        # Concatenate all these encodings into one large encoding for Spatial Pooling.
        encoding = SDR(self.encTimestamp.size +
                       self.encValue.size).concatenate([valueBits, dateBits])
        self.enc_info.addData(encoding)

        # 2. Spatial Pooler
        # Create an SDR to represent active columns, This will be populated by the
        # compute method below. It must have the same dimensions as the Spatial Pooler.
        activeColumns = SDR(self.sp.getColumnDimensions())
        # Execute Spatial Pooling algorithm over input space.
        self.sp.compute(encoding, True, activeColumns)
        self.sp_info.addData(activeColumns)

        # 3. Temporal Memory
        # Execute Temporal Memory algorithm over active mini-columns.

        # to get predictive cells we need to call activateDendrites & activateCells separately
        if PANDA_VIS_BAKE_DATA:
            # activateDendrites calculates active segments
            self.tm.activateDendrites(learn=True)
            # predictive cells are calculated directly from active segments
            predictiveCells = self.tm.getPredictiveCells()
            # activates cells in columns by TM algorithm (winners, bursting...)
            self.tm.activateCells(activeColumns, learn=True)
        else:
            self.tm.compute(activeColumns, learn=True)

        self.tm_info.addData(self.tm.getActiveCells().flatten())

        # 4.1 (optional) Predictor #TODO optional
        #TODO optional: also return an error metric on predictions (RMSE, R2,...)

        # 4.2 Anomaly
        # handle spatial, contextual (raw, likelihood) anomalies
        # -Spatial
        spatialAnomaly = 0.0  #TODO optional: make this computed in SP (and later improve)
        if self.useSpatialAnomaly:
            # Update min/max values and check if there is a spatial anomaly.
            # The check is skipped while min == max (both None before the
            # first record, or only one distinct value seen so far).
            if self.minVal != self.maxVal:
                tolerance = (self.maxVal - self.minVal) * SPATIAL_TOLERANCE
                maxExpected = self.maxVal + tolerance
                minExpected = self.minVal - tolerance
                if val > maxExpected or val < minExpected:
                    spatialAnomaly = 1.0
            if self.maxVal is None or val > self.maxVal:
                self.maxVal = val
            if self.minVal is None or val < self.minVal:
                self.minVal = val

        # `raw` keeps the TemporalMemory score; `temporalAnomaly` may be
        # replaced by the likelihood below.
        temporalAnomaly = raw = self.tm.anomaly
        if self.useLikelihood:
            temporalAnomaly = self.anomalyLikelihood.compute(temporalAnomaly)

        anomalyScore = max(
            spatialAnomaly,
            temporalAnomaly)  # this is the "main" anomaly, compared in NAB

        # 5. print stats
        if self.verbose and self.iteration_ % 1000 == 0:
            # print(self.enc_info)
            # print(self.sp_info)
            # print(self.tm_info)
            pass

        # 6. panda vis
        if PANDA_VIS_BAKE_DATA:
            # ------------------HTMpandaVis----------------------
            # see more about this structure at https://github.com/htm-community/HTMpandaVis/blob/master/pandaBaker/README.md
            # fill up values
            pandaBaker.inputs["Value"].stringValue = "value: {:.2f}".format(
                val)
            pandaBaker.inputs["Value"].bits = valueBits.sparse

            pandaBaker.inputs["TimeOfDay"].stringValue = str(ts)
            pandaBaker.inputs["TimeOfDay"].bits = dateBits.sparse

            layer = pandaBaker.layers["Layer1"]
            layer.activeColumns = activeColumns.sparse
            layer.winnerCells = self.tm.getWinnerCells().sparse
            layer.predictiveCells = predictiveCells.sparse
            layer.activeCells = self.tm.getActiveCells().sparse

            # customizable datastreams to be show on the DASH PLOTS
            streams = pandaBaker.dataStreams
            # The stream is named "rawAnomaly", so feed it the raw score and
            # not the (possibly likelihood-transformed) temporal anomaly.
            streams["rawAnomaly"].value = raw
            streams["value"].value = val
            streams["numberOfWinnerCells"].value = len(
                self.tm.getWinnerCells().sparse)
            streams["numberOfPredictiveCells"].value = len(
                predictiveCells.sparse)
            streams["valueInput_sparsity"].value = valueBits.getSparsity()
            streams["dateInput_sparsity"].value = dateBits.getSparsity()

            streams[
                "Layer1_SP_overlap_metric"].value = self.sp_info.overlap.overlap
            # TM stream reads the TM metrics (was reading sp_info).
            streams[
                "Layer1_TM_overlap_metric"].value = self.tm_info.overlap.overlap
            streams[
                "Layer1_SP_activation_frequency"].value = self.sp_info.activationFrequency.mean(
                )
            streams[
                "Layer1_TM_activation_frequency"].value = self.tm_info.activationFrequency.mean(
                )
            # NOTE(review): the two *_entropy streams are fed the mean
            # activation frequency, exactly like the *_activation_frequency
            # streams above -- confirm whether an entropy value was intended.
            streams[
                "Layer1_SP_entropy"].value = self.sp_info.activationFrequency.mean(
                )
            streams[
                "Layer1_TM_entropy"].value = self.tm_info.activationFrequency.mean(
                )

            pandaBaker.StoreIteration(self.iteration_ - 1)
            print("ITERATION: " + str(self.iteration_ - 1))

            # ------------------HTMpandaVis----------------------

        return (anomalyScore, raw)

    # with this method, the structure for visualization is defined
    def BuildPandaSystem(self, sp, tm, consumptionBits_size, dateBits_size):
        """
        Register the inputs, the single layer and the data streams with
        pandaBaker, then prepare its database.

        @param sp SpatialPooler of the layer
        @param tm TemporalMemory of the layer
        @param consumptionBits_size size passed to cInput for the "Value" input
        @param dateBits_size size passed to cInput for the "TimeOfDay" input
        """

        # we have two inputs connected to proximal synapses of Layer1
        pandaBaker.inputs["Value"] = cInput(consumptionBits_size)
        pandaBaker.inputs["TimeOfDay"] = cInput(dateBits_size)

        pandaBaker.layers["Layer1"] = cLayer(
            sp, tm)  # Layer1 has Spatial Pooler & Temporal Memory
        pandaBaker.layers["Layer1"].proximalInputs = [
            "Value",
            "TimeOfDay",
        ]
        pandaBaker.layers["Layer1"].distalInputs = ["Layer1"]

        # data for dash plots
        streams = [
            "rawAnomaly", "value", "numberOfWinnerCells",
            "numberOfPredictiveCells", "valueInput_sparsity",
            "dateInput_sparsity", "Layer1_SP_overlap_metric",
            "Layer1_TM_overlap_metric", "Layer1_SP_activation_frequency",
            "Layer1_TM_activation_frequency", "Layer1_SP_entropy",
            "Layer1_TM_entropy"
        ]

        # one fresh cDataStream per stream name
        # could be also written like: pandaBaker.dataStreams["myStreamName"] = cDataStream()
        pandaBaker.dataStreams = {name: cDataStream() for name in streams}

        pandaBaker.PrepareDatabase()