Example #1
    def test_GenerateDataset(self):
        dataset = 'extra/gym/gym.csv'

        print "Using input dataset: ", dataset

        gymFields = None
        with FileRecordStream(findDataset(dataset)) as f:
            gymFields = f.getFieldNames()

        aggregationOptions = dict(timeField=gymFields.index('timestamp'),
                                  fields=[('attendeeCount', sum),
                                          ('consumption', sum),
                                          ('timestamp', lambda x: x[0])],
                                  hours=5)

        handle = \
          tempfile.NamedTemporaryFile(prefix='agg_gym_hours_5',
            suffix='.csv',
            dir=os.path.dirname(findDataset(dataset)))
        outputFile = handle.name
        handle.close()

        print "Expected outputFile path: ", outputFile

        print "Files in the destination folder before the test:"
        print os.listdir(os.path.abspath(os.path.dirname(
            findDataset(dataset))))

        if os.path.isfile(outputFile):
            print "Removing existing outputFile: ", outputFile
            os.remove(outputFile)

        self.assertFalse(os.path.exists(outputFile),
                         msg="Shouldn't exist, but does: " + str(outputFile))

        result = generateDataset(aggregationOptions, dataset, outputFile)
        print "generateDataset() returned: ", result

        f1 = os.path.abspath(os.path.normpath(result))
        print "normalized generateDataset() result path: ", f1
        f2 = os.path.normpath(outputFile)
        print "normalized outputFile path: ", f2
        self.assertEqual(f1, f2)

        print "Checking for presence of outputFile: ", outputFile
        self.assertTrue(
            os.path.isfile(outputFile),
            msg="Missing outputFile: %r; normalized generateDataset() result: %r"
                % (outputFile, f1))

        print "Files in the destination folder after the test:"
        print os.listdir(os.path.abspath(os.path.dirname(
            findDataset(dataset))))

        print result
        print '-' * 30

        return
Example #2
  def test_GenerateDataset(self):
    dataset = 'extra/gym/gym.csv'

    print "Using input dataset: ", dataset

    gymFields = None
    with FileRecordStream(findDataset(dataset)) as f:
      gymFields = f.getFieldNames()

    aggregationOptions = dict(
      timeField=gymFields.index('timestamp'),
      fields=[('attendeeCount', sum),
              ('consumption', sum),
              ('timestamp', lambda x: x[0])],

      hours=5
      )
    
    handle = \
      tempfile.NamedTemporaryFile(prefix='agg_gym_hours_5', 
        suffix='.csv', 
        dir=os.path.dirname(findDataset(dataset)))
    outputFile = handle.name
    handle.close()

    print "Expected outputFile path: ", outputFile

    print "Files in the destination folder before the test:"
    print os.listdir(os.path.abspath(os.path.dirname(findDataset(dataset))))

    if os.path.isfile(outputFile):
      print "Removing existing outputFile: ", outputFile
      os.remove(outputFile)

    self.assertFalse(os.path.exists(outputFile),
                     msg="Shouldn't exist, but does: " + str(outputFile))

    result = generateDataset(aggregationOptions, dataset, outputFile)
    print "generateDataset() returned: ", result

    f1 = os.path.abspath(os.path.normpath(result))
    print "normalized generateDataset() result path: ", f1
    f2 = os.path.normpath(outputFile)
    print "normalized outputFile path: ", f2
    self.assertEqual(f1, f2)

    print "Checking for presence of outputFile: ", outputFile
    self.assertTrue(
      os.path.isfile(outputFile),
      msg="Missing outputFile: %r; normalized generateDataset() result: %r" % (
        outputFile, f1))

    print "Files in the destination folder after the test:"
    print os.listdir(os.path.abspath(os.path.dirname(findDataset(dataset))))

    print result
    print '-' * 30

    return
Example #3
def getFilename(aggregationInfo, inputFile):
    """Generate the filename for aggregated dataset

  The filename is based on the input filename and the
  aggregation period.

  Returns the inputFile if no aggregation required (aggregation
  info has all 0's)
  """

    # Find the actual file, with an absolute path
    inputFile = findDataset(inputFile)

    a = defaultdict(lambda: 0, aggregationInfo)
    outputDir = os.path.dirname(inputFile)
    outputFile = 'agg_%s' % os.path.splitext(os.path.basename(inputFile))[0]
    noAggregation = True
    timePeriods = 'years months weeks days '\
                  'hours minutes seconds milliseconds microseconds'
    for k in timePeriods.split():
        if a[k] > 0:
            noAggregation = False
            outputFile += '_%s_%d' % (k, a[k])

    if noAggregation:
        return inputFile
    outputFile += '.csv'
    outputFile = os.path.join(outputDir, outputFile)

    return outputFile
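A minimal usage sketch (illustrative, not part of the original example) of the naming rule described in the docstring above, assuming findDataset() can resolve the gym dataset used throughout this listing:

# getFilename() appends one '<period>_<n>' suffix per non-zero aggregation period.
aggInfo = {'hours': 5}                      # every other period defaults to 0
print getFilename(aggInfo, 'extra/gym/gym.csv')
# -> <dataset directory>/agg_gym_hours_5.csv
print getFilename({}, 'extra/gym/gym.csv')  # no aggregation: resolved input path returned unchanged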
Example #4
def _createLPFNetwork(addSP=True, addTP=False):
    """Create an 'old-style' network ala LPF and return it."""

    # ==========================================================================
    # Create the encoder and data source stuff we need to configure the sensor
    sensorParams = dict(verbosity=_VERBOSITY)
    encoder = _createEncoder()
    trainFile = findDataset("extra/gym/gym.csv")
    dataSource = FileRecordStream(streamID=trainFile)
    dataSource.setAutoRewind(True)

    # Create all the stuff we need to configure the CLARegion
    g_claConfig["spEnable"] = addSP
    g_claConfig["tpEnable"] = addTP
    claParams = _getCLAParams(encoder=encoder, config=g_claConfig)
    claParams["spSeed"] = g_claConfig["spSeed"]
    claParams["tpSeed"] = g_claConfig["tpSeed"]

    # ==========================================================================
    # Now create the network itself
    n = Network()

    n.addRegion("sensor", "py.RecordSensor", json.dumps(sensorParams))

    sensor = n.regions["sensor"].getSelf()
    sensor.encoder = encoder
    sensor.dataSource = dataSource

    n.addRegion("level1", "py.CLARegion", json.dumps(claParams))

    n.link("sensor", "level1", "UniformLink", "")
    n.link("sensor", "level1", "UniformLink", "", srcOutput="resetOut", destInput="resetIn")

    return n
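A hedged usage sketch (not from the original source): once built, the network can be stepped through the auto-rewinding gym data with Network.run(), the same call the save/reload test later in this listing uses:

net = _createLPFNetwork(addSP=True, addTP=False)
net.run(100)                               # feed 100 records from gym.csv through the network
level1 = net.regions["level1"].getSelf()   # grab the CLARegion instance if its state is needed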
Example #5
def runHotgym():
  model = createModel()
  model.enableInference({'predictedField': 'consumption'})
  metricsManager = MetricsManager(_METRIC_SPECS, model.getFieldInfo(),
                                  model.getInferenceType())
  with open (findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    reader.next()
    reader.next()
    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["consumption"] = float(modelInput["consumption"])
      modelInput["timestamp"] = datetime.datetime.strptime(
          modelInput["timestamp"], "%m/%d/%y %H:%M")
      result = model.run(modelInput)
      result.metrics = metricsManager.update(result)
      isLast = i == _NUM_RECORDS
      if i % 100 == 0 or isLast:
        _LOGGER.info("After %i records, 1-step altMAPE=%f", i,
                    result.metrics["multiStepBestPredictions:multiStep:"
                                   "errorMetric='altMAPE':steps=1:window=1000:"
                                   "field=consumption"])
      if isLast:
        break
Example #6
def _createLPFNetwork(addSP = True, addTP = False):
  """Create an 'old-style' network ala LPF and return it."""

  # ==========================================================================
  # Create the encoder and data source stuff we need to configure the sensor
  sensorParams = dict(verbosity = _VERBOSITY)
  encoder = _createEncoder()
  trainFile = findDataset("extra/gym/gym.csv")
  dataSource = FileRecordStream(streamID=trainFile)
  dataSource.setAutoRewind(True)

  # Create all the stuff we need to configure the CLARegion
  g_claConfig['spEnable'] = addSP
  g_claConfig['tpEnable'] = addTP
  claParams = _getCLAParams(encoder = encoder, config= g_claConfig)
  claParams['spSeed'] = g_claConfig['spSeed']
  claParams['tpSeed'] = g_claConfig['tpSeed']

  # ==========================================================================
  # Now create the network itself
  n = Network()

  n.addRegion("sensor", "py.RecordSensor", json.dumps(sensorParams))

  sensor = n.regions['sensor'].getSelf()
  sensor.encoder = encoder
  sensor.dataSource = dataSource

  n.addRegion("level1", "py.CLARegion", json.dumps(claParams))

  n.link("sensor", "level1", "UniformLink", "")
  n.link("sensor", "level1", "UniformLink", "",
         srcOutput="resetOut", destInput="resetIn")

  return n
Example #7
def getFilename(aggregationInfo, inputFile):
  """Generate the filename for aggregated dataset

  The filename is based on the input filename and the
  aggregation period.

  Returns the inputFile if no aggregation required (aggregation
  info has all 0's)
  """

  # Find the actual file, with an absolute path
  inputFile = findDataset(inputFile)

  a = defaultdict(lambda: 0, aggregationInfo)
  outputDir = os.path.dirname(inputFile)
  outputFile = 'agg_%s' % os.path.splitext(os.path.basename(inputFile))[0]
  noAggregation = True
  timePeriods = 'years months weeks days '\
                'hours minutes seconds milliseconds microseconds'
  for k in timePeriods.split():
    if a[k] > 0:
      noAggregation = False
      outputFile += '_%s_%d' % (k, a[k])

  if noAggregation:
    return inputFile
  outputFile += '.csv'
  outputFile = os.path.join(outputDir, outputFile)

  return outputFile
Example #8
def runHotgym():
    model = createModel()
    model.enableInference({'predictedField': 'consumption'})
    metricsManager = MetricsManager(_METRIC_SPECS, model.getFieldInfo(),
                                    model.getInferenceType())
    with open(findDataset(_DATA_PATH)) as fin:
        reader = csv.reader(fin)
        headers = reader.next()
        reader.next()
        reader.next()
        for i, record in enumerate(reader, start=1):
            modelInput = dict(zip(headers, record))
            modelInput["consumption"] = float(modelInput["consumption"])
            modelInput["timestamp"] = datetime.datetime.strptime(
                modelInput["timestamp"], "%m/%d/%y %H:%M")
            result = model.run(modelInput)
            result.metrics = metricsManager.update(result)
            isLast = i == _NUM_RECORDS
            if i % 100 == 0 or isLast:
                _LOGGER.info(
                    "After %i records, 1-step altMAPE=%f", i,
                    result.metrics["multiStepBestPredictions:multiStep:"
                                   "errorMetric='altMAPE':steps=1:window=1000:"
                                   "field=consumption"])
            if isLast:
                break
Example #9
def runHotgymAnomaly():
    model = createModel()
    model.enableInference({'predictedField': 'consumption'})
    with open(findDataset(_DATA_PATH)) as fin:
        reader = csv.reader(fin)
        csvWriter = csv.writer(open(_OUTPUT_PATH, "wb"))
        csvWriter.writerow(["timestamp", "consumption", "anomaly_score"])
        headers = reader.next()
        reader.next()
        reader.next()
        for i, record in enumerate(reader, start=1):
            modelInput = dict(zip(headers, record))
            modelInput["consumption"] = float(modelInput["consumption"])
            modelInput["timestamp"] = datetime.datetime.strptime(
                modelInput["timestamp"], "%m/%d/%y %H:%M")
            result = model.run(modelInput)
            anomalyScore = result.inferences['anomalyScore']
            csvWriter.writerow([
                modelInput["timestamp"], modelInput["consumption"],
                anomalyScore
            ])
            if anomalyScore > _ANOMALY_THRESHOLD:
                _LOGGER.info("Anomaly detected at [%s]. Anomaly score: %f.",
                             result.rawInput["timestamp"], anomalyScore)

    print "Anomaly scores have been written to", _OUTPUT_PATH
Example #10
def runHotgym():
    model = createModel()
    model.enableInference({"predictionSteps": [1, 5], "predictedField": "consumption", "numRecords": 4000})
    print findDataset(DATA_PATH)
    with open(findDataset(DATA_PATH)) as fin:
        reader = csv.reader(fin)
        headers = reader.next()
        print headers
        print reader.next()
        print reader.next()
        for record in reader:
            print record
            modelInput = dict(zip(headers, record))
            modelInput["consumption"] = float(modelInput["consumption"])
            modelInput["timestamp"] = datetime.datetime.strptime(modelInput["timestamp"], "%Y-%m-%d %H:%M:%S.%f")
            result = model.run(modelInput)
            print result
Example #11
def _createOPFNetwork(addSP=True, addTP=False):
    """Create a 'new-style' network ala OPF and return it.
  If addSP is true, an SPRegion will be added named 'level1SP'.
  If addTP is true, a TPRegion will be added named 'level1TP'
  """

    # ==========================================================================
    # Create the encoder and data source stuff we need to configure the sensor
    sensorParams = dict(verbosity=_VERBOSITY)
    encoder = _createEncoder()
    trainFile = findDataset("extra/gym/gym.csv")
    dataSource = FileRecordStream(streamID=trainFile)
    dataSource.setAutoRewind(True)

    # ==========================================================================
    # Now create the network itself
    n = Network()
    n.addRegion("sensor", "py.RecordSensor", json.dumps(sensorParams))

    sensor = n.regions["sensor"].getSelf()
    sensor.encoder = encoder
    sensor.dataSource = dataSource

    # ==========================================================================
    # Add the SP if requested
    if addSP:
        print "Adding SPRegion"
        g_spRegionConfig["inputWidth"] = encoder.getWidth()
        n.addRegion("level1SP", "py.SPRegion", json.dumps(g_spRegionConfig))

        n.link("sensor", "level1SP", "UniformLink", "")
        n.link("sensor", "level1SP", "UniformLink", "", srcOutput="resetOut", destInput="resetIn")
        n.link("level1SP", "sensor", "UniformLink", "", srcOutput="spatialTopDownOut", destInput="spatialTopDownIn")
        n.link("level1SP", "sensor", "UniformLink", "", srcOutput="temporalTopDownOut", destInput="temporalTopDownIn")

    # ==========================================================================
    if addTP and addSP:
        # Add the TP on top of SP if requested
        # The input width of the TP is set to the column count of the SP
        print "Adding TPRegion on top of SP"
        g_tpRegionConfig["inputWidth"] = g_spRegionConfig["columnCount"]
        n.addRegion("level1TP", "py.TPRegion", json.dumps(g_tpRegionConfig))
        n.link("level1SP", "level1TP", "UniformLink", "")
        n.link("level1TP", "level1SP", "UniformLink", "", srcOutput="topDownOut", destInput="topDownIn")
        n.link("sensor", "level1TP", "UniformLink", "", srcOutput="resetOut", destInput="resetIn")

    elif addTP:
        # Add a lone TPRegion if requested
        # The input width of the TP is set to the encoder width
        print "Adding TPRegion"
        g_tpRegionConfig["inputWidth"] = encoder.getWidth()
        n.addRegion("level1TP", "py.TPRegion", json.dumps(g_tpRegionConfig))

        n.link("sensor", "level1TP", "UniformLink", "")
        n.link("sensor", "level1TP", "UniformLink", "", srcOutput="resetOut", destInput="resetIn")

    return n
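An illustrative sketch (assumption: called from the same module as the function above) of the three configurations the docstring describes; the region names follow its naming scheme:

netSP   = _createOPFNetwork(addSP=True,  addTP=False)  # regions: sensor, level1SP
netSPTP = _createOPFNetwork(addSP=True,  addTP=True)   # regions: sensor, level1SP, level1TP
netTP   = _createOPFNetwork(addSP=False, addTP=True)   # regions: sensor, level1TP
netSPTP.run(10)                                         # each network can then be run independently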
Example #12
    def _openStream(self, dataUrl, isBlocking, maxTimeout, bookmark, firstRecordIdx):
        """Open the underlying file stream.

    This only supports 'file://' prefixed paths.
    """
        self._recordStoreName = findDataset(dataUrl[len(FILE_PREF) :])
        self._recordStore = FileRecordStream(
            streamID=self._recordStoreName, write=False, bookmark=bookmark, firstRecord=firstRecordIdx
        )
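A small illustrative sketch of the prefix stripping above (assumption: FILE_PREF is the 'file://' prefix the docstring refers to; it is defined elsewhere in the real module):

FILE_PREF = 'file://'                  # assumed value, per the docstring
dataUrl = 'file://extra/gym/gym.csv'
print dataUrl[len(FILE_PREF):]         # -> 'extra/gym/gym.csv', handed to findDataset()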
Example #13
def runGeospatialAnomaly(dataPath, outputPath):
  model = createModel()

  with open (findDataset(dataPath)) as fin:
    reader = csv.reader(fin)
    csvWriter = csv.writer(open(outputPath,"wb"))
    csvWriter.writerow(["timestamp",
                       "longitude",
                       "latitude",
                       "speed",
                       "anomaly_score",
                       "new_sequence"])

    reader.next()
    reader.next()
    reader.next()

    lastTimestamp = None

    for _, record in enumerate(reader, start=1):
      timestamp = datetime.datetime.fromtimestamp(int(record[1]) / 1e3)
      longitude = float(record[2])
      latitude = float(record[3])
      speed = float(record[5])
      accuracy = float(record[7])

      if accuracy > ACCURACY_THRESHOLD:
        continue

      newSequence = False
      if lastTimestamp and (
        (timestamp - lastTimestamp).total_seconds() > INTERVAL_THRESHOLD):
        newSequence = True
      lastTimestamp = timestamp

      if newSequence:
        print "Starting new sequence..."
        model.resetSequenceStates()

      modelInput = {
        "vector": (longitude, latitude, speed)
      }
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']

      csvWriter.writerow([timestamp,
                          longitude,
                          latitude,
                          speed,
                          anomalyScore,
                          1 if newSequence else 0])

      print "[{0}] - Anomaly score: {1}.".format(timestamp, anomalyScore)

  print "Anomaly scores have been written to {0}".format(outputPath)
Example #14
    def _openStream(self, dataUrl, isBlocking, maxTimeout, bookmark,
                    firstRecordIdx):
        """Open the underlying file stream.

    This only supports 'file://' prefixed paths.
    """
        self._recordStoreName = findDataset(dataUrl[len(FILE_PREF):])
        self._recordStore = FileRecordStream(streamID=self._recordStoreName,
                                             write=False,
                                             bookmark=bookmark,
                                             firstRecord=firstRecordIdx)
Example #15
  def testFindDataset(self):
    # Test non-existing dataset (relative path)
    with self.assertRaises(Exception):
      findDataset('no_such_dataset.csv')

    # Test non-existing dataset (absolute path)
    with self.assertRaises(Exception):
      findDataset('/no_such_dataset.csv')

    # Test existing dataset (relative path)
    if not os.path.isdir('data'):
      os.makedirs('data')
    datasetPath = 'test_find_dataset.csv'
    filename = 'data/test_find_dataset.csv'
    # This is the uncompressed name.
    fullPath = os.path.abspath(filename)
    if os.path.exists(fullPath):
      os.remove(fullPath)
    fullPathCompressed = fullPath + ".gz"
    if os.path.exists(fullPathCompressed):
      os.remove(fullPathCompressed)

    # Create the "dataset"
    open(filename, 'w').write('123')
    path = findDataset(datasetPath)
    self.assertEqual(path, fullPath)
    self.assertTrue(os.path.exists(path))

    # This should do nothing, since it is already compressed
    path = uncompressAndCopyDataset(path)
    self.assertEqual(path, fullPath)

    # Test existing dataset (absolute path)
    self.assertEqual(findDataset(fullPath), fullPath)

    # Test existing dataset (compressed path)

    # Create the compressed file
    import gzip
    f = gzip.GzipFile(fullPathCompressed, 'w')
    f.write("1,2,3\n")
    f.close()
    self.assertTrue(os.path.isfile(fullPathCompressed))

    # Remove the original file
    os.remove(fullPath)

    self.assertEqual(findDataset(datasetPath), fullPathCompressed)
    # This should put the uncompressed file in the same directory
    path = uncompressAndCopyDataset(fullPathCompressed)
    self.assertEqual(path, fullPath)
    self.assertTrue(os.path.isfile(path))

    os.remove(fullPath)
    os.remove(fullPathCompressed)
Example #16
    def testFindDataset(self):
        # Test non-existing dataset (relative path)
        with self.assertRaises(Exception):
            findDataset('no_such_dataset.csv')

        # Test non-existing dataset (absolute path)
        with self.assertRaises(Exception):
            findDataset('/no_such_dataset.csv')

        # Test existing dataset (relative path)
        if not os.path.isdir('data'):
            os.makedirs('data')
        datasetPath = 'test_find_dataset.csv'
        filename = 'data/test_find_dataset.csv'
        # This is the uncompressed name.
        fullPath = os.path.abspath(filename)
        if os.path.exists(fullPath):
            os.remove(fullPath)
        fullPathCompressed = fullPath + ".gz"
        if os.path.exists(fullPathCompressed):
            os.remove(fullPathCompressed)

        # Create the "dataset"
        open(filename, 'w').write('123')
        path = findDataset(datasetPath)
        self.assertEqual(path, fullPath)
        self.assertTrue(os.path.exists(path))

        # This should do nothing, since it is already compressed
        path = uncompressAndCopyDataset(path)
        self.assertEqual(path, fullPath)

        # Test existing dataset (absolute path)
        self.assertEqual(findDataset(fullPath), fullPath)

        # Test existing dataset (compressed path)

        # Create the compressed file
        import gzip
        f = gzip.GzipFile(fullPathCompressed, 'w')
        f.write("1,2,3\n")
        f.close()
        self.assertTrue(os.path.isfile(fullPathCompressed))

        # Remove the original file
        os.remove(fullPath)

        self.assertEqual(findDataset(datasetPath), fullPathCompressed)
        # This should put the uncompressed file in the same directory
        path = uncompressAndCopyDataset(fullPathCompressed)
        self.assertEqual(path, fullPath)
        self.assertTrue(os.path.isfile(path))

        os.remove(fullPath)
        os.remove(fullPathCompressed)
Example #17
def runDemo():
    trainFile = findDataset(_INPUT_FILE_PATH)
    dataSource = FileRecordStream(streamID=trainFile)
    numRecords = dataSource.getDataRowCount()
    print "Creating network"
    network = createNetwork(dataSource)
    outputPath = os.path.join(os.path.dirname(__file__), _OUTPUT_FILE_NAME)
    with open(outputPath, "w") as outputFile:
        writer = csv.writer(outputFile)
        print "Running network"
        print "Writing output to: %s" % outputPath
        runNetwork(network, numRecords, writer)
    print "Hierarchy demo finished"
Example #18
def runDemo():
  trainFile = findDataset(_INPUT_FILE_PATH)
  dataSource = FileRecordStream(streamID=trainFile)
  numRecords = dataSource.getDataRowCount()
  print "Creating network"
  network = createNetwork(dataSource)
  outputPath = os.path.join(os.path.dirname(__file__), _OUTPUT_FILE_NAME)
  with open(outputPath, "w") as outputFile:
    writer = csv.writer(outputFile)
    print "Running network"
    print "Writing output to: %s" % outputPath
    runNetwork(network, numRecords, writer)
  print "Hierarchy demo finished"
Example #19
def runHotgym():
    model = createModel()
    model.enableInference({
        'predictionSteps': [1, 5],
        'predictedField': 'consumption',
        'numRecords': 4000
    })
    print findDataset(DATA_PATH)
    with open(findDataset(DATA_PATH)) as fin:
        reader = csv.reader(fin)
        headers = reader.next()
        print headers
        print reader.next()
        print reader.next()
        for record in reader:
            print record
            modelInput = dict(zip(headers, record))
            modelInput["consumption"] = float(modelInput["consumption"])
            modelInput["timestamp"] = datetime.datetime.strptime(
                modelInput["timestamp"], "%Y-%m-%d %H:%M:%S.%f")
            result = model.run(modelInput)
            print result
Example #20
  def test_GymAggregateWithOldData(self):
    filename = findDataset('extra/gym/gym.csv')

    input = []

    gymFields = None

    with FileRecordStream(filename) as f:
      gymFields = f.getFields()
      for i in range(10):
        input.append(f.getNextRecord())

    #Append the records from the beginning to the end of the dataset
    input.extend(input[0:3])
    for h in (1,3):
      aggregationOptions = dict(
        fields=[
          ('timestamp', lambda x: x[0],),
          ('attendeeCount', sum),
          ('consumption', sum)],
        hours=h
      )


      handle = \
        tempfile.NamedTemporaryFile(prefix='test', 
          suffix='.bin')
      outputFile = handle.name
      handle.close()
      
      dataInput = DataInputList(input, gymFields)
      dataOutput = DataOutputList(None)

      _aggregate(input=dataInput, options=aggregationOptions, 
                 timeFieldName='timestamp', output=dataOutput)
      dataOutput.close()

      outputRecords = dataOutput._store
      
      timeFieldIdx = [f[0] for f in gymFields].index('timestamp')
      diffs = []
      for i in range(1,len(outputRecords)):
        diffs.append(outputRecords[i][timeFieldIdx] - \
                     outputRecords[i-1][timeFieldIdx])
      positiveTimeFlow = map((lambda x: x < datetime.timedelta(seconds=0)), 
                            diffs)
      #Make sure that old records are in the aggregated output and at the same
      #time make sure that they are in consecutive order after being inserted
      self.assertEquals(sum(positiveTimeFlow), 1)
        
    return
Example #21
    def test_GymAggregateWithOldData(self):
        filename = findDataset('extra/gym/gym.csv')

        input = []

        gymFields = None

        with FileRecordStream(filename) as f:
            gymFields = f.getFields()
            for i in range(10):
                input.append(f.getNextRecord())

        #Append the records from the beginning to the end of the dataset
        input.extend(input[0:3])
        for h in (1, 3):
            aggregationOptions = dict(
                fields=[('timestamp', lambda x: x[0]),
                        ('attendeeCount', sum),
                        ('consumption', sum)],
                hours=h)


            handle = \
              tempfile.NamedTemporaryFile(prefix='test',
                suffix='.bin')
            outputFile = handle.name
            handle.close()

            dataInput = DataInputList(input, gymFields)
            dataOutput = DataOutputList(None)

            _aggregate(input=dataInput,
                       options=aggregationOptions,
                       timeFieldName='timestamp',
                       output=dataOutput)
            dataOutput.close()

            outputRecords = dataOutput._store

            timeFieldIdx = [f[0] for f in gymFields].index('timestamp')
            diffs = []
            for i in range(1, len(outputRecords)):
                diffs.append(outputRecords[i][timeFieldIdx] - \
                             outputRecords[i-1][timeFieldIdx])
            positiveTimeFlow = map(
                (lambda x: x < datetime.timedelta(seconds=0)), diffs)
            #Make sure that old records are in the aggregated output and at the same
            #time make sure that they are in consecutive order after being inserted
            self.assertEquals(sum(positiveTimeFlow), 1)

        return
Example #22
def trainAndClassify(trainingSetSize, model, data_path, results_path):
  """
  In this function we explicitly label specific portions of the data stream.
  Any later record that matches the pattern will get labeled the same way.
  """

  model.enableInference({'predictedField': 'y'})

  # Here we will get the classifier instance so we can add and query labels.
  classifierRegion = model._getAnomalyClassifier()
  classifierRegionPy = classifierRegion.getSelf()

  # We need to set this classification type. It is supposed to be the default
  # but is not for some reason.
  classifierRegionPy.classificationVectorType = 2

  with open (findDataset(data_path)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    csvWriter = csv.writer(open(results_path,"wb"))
    csvWriter.writerow(["x", "y", "trueLabel", "anomalyScore", "predictedLabel"])
    for x, record in enumerate(reader):
      modelInput = dict(zip(headers, record))
      modelInput["y"] = float(modelInput["y"])
      trueLabel = modelInput["label"]
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      predictedLabel = result.inferences['anomalyLabel']
      if predictedLabel == "[]":
        predictedLabel = 'label0'
      else:
        predictedLabel = result.inferences['anomalyLabel'][2:-2]

      # Relabel predictions for all records whose indices fall within the training set size range.
      if x < trainingSetSize:
        for label in CLASS_RANGES:
          for class_range in CLASS_RANGES[label]:
            start = class_range['start']
            end = class_range['end']

            if start <= x <= end:
              predictedLabel = label

            if x == end + 2:
              print "Adding labeled anomalies for record", x
              classifierRegion.executeCommand(["addLabel", str(start), str(end + 1), label])

      csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore, predictedLabel])

  print "Results scores have been written to %s" % results_path
Example #23
def trainAndClassify(model, data_path, results_path):
  """
  In this function we explicitly label specific portions of the data stream.
  Any later record that matches the pattern will get labeled the same way.
  """

  model.enableInference({'predictedField': 'label'})
  model.enableLearning()

  # Here we will get the classifier instance so we can add and query labels.
  classifierRegion = model._getAnomalyClassifier()
  classifierRegionPy = classifierRegion.getSelf()

  # We need to set this classification type. It is supposed to be the default
  # but is not for some reason.
  classifierRegionPy.classificationVectorType = 2

  with open (findDataset(data_path)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    # Skip the first 2 rows
    reader.next()
    reader.next()
    
    csvWriter = csv.writer(open(results_path,"wb"))
    csvWriter.writerow(["x", "y", "trueLabel", "anomalyScore", "predictedLabel"])
    for x, record in enumerate(reader):
      modelInput = dict(zip(headers, record))
      modelInput["y"] = float(modelInput["y"])
      trueLabel = modelInput["label"]
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      predictedLabel = result.inferences['anomalyLabel'][2:-2]
      
      if x < SP_TRAINING_SET_SIZE: # wait until the SP has seen the 3 classes at least once
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore, 'SP_TRAINING'])
      
      elif SP_TRAINING_SET_SIZE <= x < TM_TRAINING_SET_SIZE: 
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore, 'TM_TRAINING'])
      
      elif TM_TRAINING_SET_SIZE <= x < CLASSIFIER_TRAINING_SET_SIZE: # relabel predictions (i.e. train classifier)
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore, 'CLASSIFIER_TRAINING'])
        classifierRegion.executeCommand(["addLabel", str(x), str(x + 1), trueLabel])

      elif x >= CLASSIFIER_TRAINING_SET_SIZE: # predict
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore, predictedLabel])

  print "Results have been written to %s" % results_path
Example #24
def runHotgymAnomaly():
    model = createModel()
    model.enableInference({"predictedField": "consumption"})
    with open(findDataset(_DATA_PATH)) as fin:
        reader = csv.reader(fin)
        headers = reader.next()
        reader.next()
        reader.next()
        for i, record in enumerate(reader, start=1):
            modelInput = dict(zip(headers, record))
            modelInput["consumption"] = float(modelInput["consumption"])
            modelInput["timestamp"] = datetime.datetime.strptime(modelInput["timestamp"], "%m/%d/%y %H:%M")
            result = model.run(modelInput)
            anomalyScore = result.inferences["anomalyScore"]
            if anomalyScore > _ANOMALY_THRESHOLD:
                _LOGGER.info("Anomaly detected at [%s]. Anomaly score: %f.", result.rawInput["timestamp"], anomalyScore)
Example #25
    def testSaveAndReload(self):
        """
    This function tests saving and loading. It will train a network for 500
    iterations, then save it and reload it as a second network instance. It will
    then run both networks for 100 iterations and ensure they return identical
    results.
    """

        print "Creating network..."

        netOPF = _createOPFNetwork()
        level1OPF = netOPF.regions['level1SP']

        # ==========================================================================
        print "Training network for 500 iterations"
        level1OPF.setParameter('learningMode', 1)
        level1OPF.setParameter('inferenceMode', 0)
        netOPF.run(500)
        level1OPF.setParameter('learningMode', 0)
        level1OPF.setParameter('inferenceMode', 1)

        # ==========================================================================
        # Save network and reload as a second instance. We need to reset the data
        # source for the unsaved network so that both instances start at the same
        # place
        print "Saving and reload network"
        _, tmpNetworkFilename = _setupTempDirectory("trained.nta")
        netOPF.save(tmpNetworkFilename)
        netOPF2 = Network(tmpNetworkFilename)
        level1OPF2 = netOPF2.regions['level1SP']

        sensor = netOPF.regions['sensor'].getSelf()
        trainFile = findDataset("extra/gym/gym.csv")
        sensor.dataSource = FileRecordStream(streamID=trainFile)
        sensor.dataSource.setAutoRewind(True)

        # ==========================================================================
        print "Running inference on the two networks for 100 iterations"
        for _ in xrange(100):
            netOPF2.run(1)
            netOPF.run(1)
            l1outputOPF2 = level1OPF2.getOutputData("bottomUpOut")
            l1outputOPF = level1OPF.getOutputData("bottomUpOut")
            opfHash2 = l1outputOPF2.nonzero()[0].sum()
            opfHash = l1outputOPF.nonzero()[0].sum()

            self.assertEqual(opfHash2, opfHash)
Example #26
    def testSaveAndReload(self):
        """
    This function tests saving and loading. It will train a network for 500
    iterations, then save it and reload it as a second network instance. It will
    then run both networks for 100 iterations and ensure they return identical
    results.
    """

        print "Creating network..."

        netOPF = _createOPFNetwork()
        level1OPF = netOPF.regions["level1SP"]

        # ==========================================================================
        print "Training network for 500 iterations"
        level1OPF.setParameter("learningMode", 1)
        level1OPF.setParameter("inferenceMode", 0)
        netOPF.run(500)
        level1OPF.setParameter("learningMode", 0)
        level1OPF.setParameter("inferenceMode", 1)

        # ==========================================================================
        # Save network and reload as a second instance. We need to reset the data
        # source for the unsaved network so that both instances start at the same
        # place
        print "Saving and reload network"
        _, tmpNetworkFilename = _setupTempDirectory("trained.nta")
        netOPF.save(tmpNetworkFilename)
        netOPF2 = Network(tmpNetworkFilename)
        level1OPF2 = netOPF2.regions["level1SP"]

        sensor = netOPF.regions["sensor"].getSelf()
        trainFile = findDataset("extra/gym/gym.csv")
        sensor.dataSource = FileRecordStream(streamID=trainFile)
        sensor.dataSource.setAutoRewind(True)

        # ==========================================================================
        print "Running inference on the two networks for 100 iterations"
        for _ in xrange(100):
            netOPF2.run(1)
            netOPF.run(1)
            l1outputOPF2 = level1OPF2.getOutputData("bottomUpOut")
            l1outputOPF = level1OPF.getOutputData("bottomUpOut")
            opfHash2 = l1outputOPF2.nonzero()[0].sum()
            opfHash = l1outputOPF.nonzero()[0].sum()

            self.assertEqual(opfHash2, opfHash)
Example #27
def classifyAnomaliesAutomatically():
  """
  In this function we use the automatic labeling feature. Here we can set an
  anomaly threshold. Any record whose anomaly score goes above the threshold
  is automatically sent to the classifier. Any later record that matches the
  pattern will get labeled as "Auto Threshold Classification (auto)"
  """
  model = createModel()
  model.enableInference({'predictedField': 'sinx'})

  # Setup the classifier to automatically classify records with
  # anomaly score >= 0.9
  classifierRegion = model._getAnomalyClassifier()
  classifierRegion.setParameter('anomalyThreshold',0.9)
  print "threshold for classifying anomalies is:", (
    classifierRegion.getParameter('anomalyThreshold'))

  with open (findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    csvWriter = csv.writer(open(_OUTPUT_PATH,"wb"))
    csvWriter.writerow(["x", "sinx", "anomaly_score", "anomalyLabel"])
    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["sinx"] = float(modelInput["sinx"])
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      anomalyLabel = result.inferences['anomalyLabel']

      # Convert the anomaly label into either 0 or 1
      if anomalyLabel == "[]":
        anomalyLabel = 0
      elif anomalyLabel == "['Auto Threshold Classification']":
        anomalyLabel = 1.0
      elif anomalyLabel == "['Auto Threshold Classification (auto)']":
        anomalyLabel = 1.0
      csvWriter.writerow([i, modelInput["sinx"], anomalyScore, anomalyLabel])

      if i > 500 and anomalyScore > _ANOMALY_THRESHOLD:
        print "Anomaly detected at row [%d]. Anomaly score: %f." % (i, anomalyScore)

  print "Anomaly scores have been written to",_OUTPUT_PATH
  print "The following labels were stored in the classifier:"
  labels = eval(classifierRegion.executeCommand(["getLabels"]))
  pprint.pprint(labels)
Example #28
def runHotgym():
    model = createModel()
    model.enableInference({'predictedField': 'consumption'})
    metricsManager = MetricsManager(METRIC_SPECS, model.getFieldInfo(),
                                    model.getInferenceType())
    with open(findDataset(DATA_PATH)) as fin:
        reader = csv.reader(fin)
        headers = reader.next()
        reader.next()
        reader.next()
        for record in reader:
            modelInput = dict(zip(headers, record))
            modelInput["consumption"] = float(modelInput["consumption"])
            modelInput["timestamp"] = datetime.datetime.strptime(
                modelInput["timestamp"], "%Y-%m-%d %H:%M:%S.%f")
            result = model.run(modelInput)
            result.metrics = metricsManager.update(result)
            print result
Example #29
def runHotgym():
  model = createModel()
  model.enableInference({'predictedField': 'Volume'})
  metricsManager = MetricsManager(_METRIC_SPECS, model.getFieldInfo(),
                                  model.getInferenceType())
  with open (findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["Volume"] = float(modelInput["Volume"])
      result = model.run(modelInput)
      result.metrics = metricsManager.update(result)
      isLast = i == _NUM_RECORDS
      if i % 100 == 0 or isLast:
        print result.metrics
      if isLast:
        break
Example #30
def computeClassificationAccuracy(resultFile, trainingSetSize):
  numErrors = 0.0
  numRecords = 0.0
  with open (findDataset(resultFile)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    for i, record in enumerate(reader):
      if numRecords >= trainingSetSize:
        data = dict(zip(headers, record))

        if data['predictedLabel'] != data['trueLabel']:
          numErrors += 1.0
          print "=> Incorrectly predicted record at line %s." % i
          print "   True Label: %s. Predicted Label: %s" % (data['trueLabel'], data['predictedLabel'])

      numRecords += 1.0

  # classification accuracy
  return (1 - numErrors / numRecords) * 100
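As a concrete, purely illustrative check of the formula: with trainingSetSize=1000, 3000 total rows read, and 100 mismatches among the 2000 post-training rows, the function returns (1 - 100 / 3000.0) * 100, roughly 96.7, since numRecords counts every row read rather than only the test rows.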
Example #31
def train():
  model.enableInference({'predictedField': 'hostname'})

  with open (findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    # Skip header lines
    reader.next()
    reader.next()

    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      model.run(modelInput)
      if i % 500 == 0:
        print("ran " + str(i) + " steps")

    model.save(os.path.join(_OUTPUT_PATH, "checkpoint"))
Example #32
def computeClassificationAccuracy(resultFile):
  numErrors = 0.0
  numTestRecords = 0.0
  with open(findDataset(resultFile)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    for i, record in enumerate(reader):
      if i >= CLASSIFIER_TRAINING_SET_SIZE:
        data = dict(zip(headers, record))

        if data['predictedLabel'] != data['trueLabel']:
          numErrors += 1.0
          print "=> Incorrectly predicted record at line %s." % i
          print "   True Label: %s. Predicted Label: %s" % (data['trueLabel'], data['predictedLabel'])

      numTestRecords += 1.0

  # classification accuracy          
  return 100 * (1 - numErrors / numTestRecords)
Example #33
    def test_GymAggregate(self):
        filename = findDataset('extra/gym/gym.csv')

        input = []

        gymFields = None

        with FileRecordStream(filename) as f:
            gymFields = f.getFields()
            for i in range(10):
                input.append(f.getNextRecord())

        for h in (1, 3):
            aggregationOptions = dict(
                fields=[('timestamp', lambda x: x[0]),
                        ('attendeeCount', sum),
                        ('consumption', sum)],
                hours=h)


            handle = \
              tempfile.NamedTemporaryFile(prefix='test',
                suffix='.bin')
            outputFile = handle.name
            handle.close()

            dataInput = DataInputList(input, gymFields)
            dataOutput = DataOutputMyFile(
                FileRecordStream(outputFile, write=True, fields=gymFields))

            _aggregate(input=dataInput,
                       options=aggregationOptions,
                       timeFieldName='timestamp',
                       output=dataOutput)

            dataOutput.close()

            for r in FileRecordStream(outputFile):
                print r
            print '-' * 30

        return
Example #34
  def test_GymAggregate(self):
    filename = findDataset('extra/gym/gym.csv')

    input = []

    gymFields = None

    with FileRecordStream(filename) as f:
      gymFields = f.getFields()
      for i in range(10):
        input.append(f.getNextRecord())

    for h in (1,3):
      aggregationOptions = dict(
        fields=[
          ('timestamp', lambda x: x[0],),
          ('attendeeCount', sum),
          ('consumption', sum)],
        hours=h
      )


      handle = \
        tempfile.NamedTemporaryFile(prefix='test', 
          suffix='.bin')
      outputFile = handle.name
      handle.close()
      
      dataInput = DataInputList(input, gymFields)
      dataOutput = DataOutputMyFile(FileRecordStream(outputFile, write=True,
                                                     fields=gymFields))

      _aggregate(input=dataInput, options=aggregationOptions, 
                 timeFieldName='timestamp', output=dataOutput)

      dataOutput.close()

      for r in FileRecordStream(outputFile):
        print r
      print '-' * 30

    return
Example #35
def trainAndClassify(trainingSetSize, model, data_path, results_path):
  """
  In this function we explicitly label specific portions of the data stream.
  Any later record that matches the pattern will get labeled the same way.
  """

  model.enableInference({'predictedField': 'y'})

  # Here we will get the classifier instance so we can add and query labels.
  classifierRegion = model._getAnomalyClassifier()
  classifierRegionPy = classifierRegion.getSelf()

  # We need to set this classification type. It is supposed to be the default
  # but is not for some reason.
  classifierRegionPy.classificationVectorType = 2

  with open (findDataset(data_path)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    csvWriter = csv.writer(open(results_path,"wb"))
    csvWriter.writerow(["x", "y", "trueLabel", "anomalyScore", "predictedLabel"])
    for x, record in enumerate(reader):
      modelInput = dict(zip(headers, record))
      modelInput["y"] = float(modelInput["y"])
      trueLabel = modelInput["label"]
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      predictedLabel = result.inferences['anomalyLabel'][2:-2]

      if x < 1000: # wait until the SP has seen the 3 classes at least once
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore, 'NOT_READY'])
      
      elif 1000 <= x < trainingSetSize: # relabel predictions (i.e. train KNN)
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore, 'TRAINING'])
        classifierRegion.executeCommand(["addLabel", str(x), str(x + 1), trueLabel])

      elif x >= trainingSetSize: # predict
        csvWriter.writerow([x, modelInput["y"], trueLabel, anomalyScore, predictedLabel])

  print "Results have been written to %s" % results_path
Example #36
def computeClassificationAccuracy(result_file):
  false_positive = 0
  false_negative = 0
  
  with open (findDataset(result_file)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    for i, record in enumerate(reader):
      data = dict(zip(headers, record))

      if data['predictedLabel'] == "label1" and data['trueLabel'] != "label1":
        false_positive += 1
        #print "False positive: %s, %s, %s" % (i, data['anomaly'], data['anomalyLabel'])
      if data['predictedLabel'] == "label0" and data['trueLabel'] != "label0":
        false_negative += 1
        #print "False negative: %s, %s, %s" % (i, data['anomaly'], data['anomalyLabel'])

  print ""
  print "== Classification accuracy for %s ==" % result_file
  print "* False positive: %s" % false_positive
  print "* False negative: %s" % false_negative
  print ""
Example #37
def runHotgymAnomaly():
  model = createModel()
  model.enableInference({'predictedField': 'consumption'})
  with open (findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    csvWriter = csv.writer(open(_OUTPUT_PATH,"wb"))
    csvWriter.writerow(["timestamp", "consumption", "anomaly_score"])
    headers = reader.next()
    reader.next()
    reader.next()
    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["consumption"] = float(modelInput["consumption"])
      modelInput["timestamp"] = datetime.datetime.strptime(
          modelInput["timestamp"], "%m/%d/%y %H:%M")
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      csvWriter.writerow([modelInput["timestamp"], modelInput["consumption"],
                          anomalyScore])
      if anomalyScore > _ANOMALY_THRESHOLD:
        _LOGGER.info("Anomaly detected at [%s]. Anomaly score: %f.",
                      result.rawInput["timestamp"], anomalyScore)

  print "Anomaly scores have been written to",_OUTPUT_PATH
Example #38
def runHotgym():
    models = []
    previousPredictions = []
    bestPrediction = 0.0
    lstPrediction = 0.0

    # Setup all the models. Here each model has a different SP seed.
    for i in range(14):
        models.append(createModel(1956 + i))

    for m in models:
        setupModel(m)
        previousPredictions.append(0.0)

    # The best least squares predictor. Initialize with 1.0/len(models)
    bestFit = numpy.ones(len(models)) / len(models)

    print "Running ensemble with", len(
        models), "models. This could take a while!"

    # Matrix to hold the last month's worth of predictions and actuals
    lstNumRows = 2000
    a = numpy.zeros((lstNumRows, len(models)))
    b = numpy.zeros(lstNumRows)

    with open("output.csv", "wb") as outputFile:
        csvWriter = csv.writer(outputFile)
        csvWriter.writerow(["timestamp", "consumption", "predictions"])
        with open(findDataset(_DATA_PATH)) as fin:
            reader = csv.reader(fin)
            headers = reader.next()
            reader.next()
            reader.next()
            for i, record in enumerate(reader, start=1):

                # Prepare input dict for feeding each model
                modelInput = dict(zip(headers, record))
                modelInput["consumption"] = float(modelInput["consumption"])
                modelInput["timestamp"] = datetime.datetime.strptime(
                    modelInput["timestamp"], "%m/%d/%y %H:%M")

                # Run each model and get each prediction and running sum
                results = []
                predictions = []
                sum = 0
                lstSum = 0
                for k, m in enumerate(models):
                    result = m.run(modelInput)
                    prediction = result.inferences["multiStepBestPredictions"][1]
                    results.append(result)
                    predictions.append(prediction)
                    sum += prediction
                    lstSum += bestFit[k] * prediction

                # # Write results to the output CSV file
                if i > 1:
                    row = [modelInput["timestamp"], modelInput["consumption"]]
                    row.extend(previousPredictions)
                    row.append(bestPrediction)
                    row.append(lstPrediction)
                    csvWriter.writerow(row)

                    # Keep a rolling store of the last lstNumRows of predictions and
                    # actuals
                    a[i % lstNumRows] = previousPredictions
                    b[i % lstNumRows] = modelInput["consumption"]
                    # Redo the least squares estimate on the last lstNumRows every week
                    if (i > 300 + lstNumRows) and (i % (24 * 7) == 0):
                        print "Iteration: %d, doing least squares fit using " \
                              "the last %d predictions!" % (i, lstNumRows)
                        x = numpy.linalg.lstsq(a, b)
                        bestFit = x[0]
                        # Print the weights and the average residual squared error
                        print bestFit, x[1][0] / lstNumRows

                # Compute best prediction (to be used next timestamp)
                # Save current predictions for later output. This shifts the
                # predictions so that they are lined up with the timestamps they are
                # actually predicting.
                previousPredictions = copy.deepcopy(predictions)
                bestPrediction = sum / len(models)
                lstPrediction = lstSum

                if i % 200 == 0: print "iteration:", i
                if i == _NUM_RECORDS:
                    break
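A self-contained sketch (illustrative data, not from the original source) of the least-squares blending step above: given a matrix of per-model predictions and the matching actuals, numpy.linalg.lstsq returns the weight vector that the loop stores in bestFit:

import numpy

# 4 timesteps x 2 models of made-up predictions, plus the observed values
a = numpy.array([[10.0, 12.0],
                 [20.0, 18.0],
                 [30.0, 33.0],
                 [40.0, 39.0]])
b = numpy.array([11.0, 19.0, 31.0, 40.0])

weights = numpy.linalg.lstsq(a, b)[0]   # least-squares blending weights
blended = numpy.dot(a, weights)         # ensemble prediction for each timestep
print weights
print blended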
Example #39
def _createOPFNetwork(addSP=True, addTP=False):
    """Create a 'new-style' network ala OPF and return it.
  If addSP is true, an SPRegion will be added named 'level1SP'.
  If addTP is true, a TPRegion will be added named 'level1TP'
  """

    # ==========================================================================
    # Create the encoder and data source stuff we need to configure the sensor
    sensorParams = dict(verbosity=_VERBOSITY)
    encoder = _createEncoder()
    trainFile = findDataset("extra/gym/gym.csv")
    dataSource = FileRecordStream(streamID=trainFile)
    dataSource.setAutoRewind(True)

    # ==========================================================================
    # Now create the network itself
    n = Network()
    n.addRegion("sensor", "py.RecordSensor", json.dumps(sensorParams))

    sensor = n.regions['sensor'].getSelf()
    sensor.encoder = encoder
    sensor.dataSource = dataSource

    # ==========================================================================
    # Add the SP if requested
    if addSP:
        print "Adding SPRegion"
        g_spRegionConfig['inputWidth'] = encoder.getWidth()
        n.addRegion("level1SP", "py.SPRegion", json.dumps(g_spRegionConfig))

        n.link("sensor", "level1SP", "UniformLink", "")
        n.link("sensor",
               "level1SP",
               "UniformLink",
               "",
               srcOutput="resetOut",
               destInput="resetIn")
        n.link("level1SP",
               "sensor",
               "UniformLink",
               "",
               srcOutput="spatialTopDownOut",
               destInput="spatialTopDownIn")
        n.link("level1SP",
               "sensor",
               "UniformLink",
               "",
               srcOutput="temporalTopDownOut",
               destInput="temporalTopDownIn")

    # ==========================================================================
    if addTP and addSP:
        # Add the TP on top of SP if requested
        # The input width of the TP is set to the column count of the SP
        print "Adding TPRegion on top of SP"
        g_tpRegionConfig['inputWidth'] = g_spRegionConfig['columnCount']
        n.addRegion("level1TP", "py.TPRegion", json.dumps(g_tpRegionConfig))
        n.link("level1SP", "level1TP", "UniformLink", "")
        n.link("level1TP",
               "level1SP",
               "UniformLink",
               "",
               srcOutput="topDownOut",
               destInput="topDownIn")
        n.link("sensor",
               "level1TP",
               "UniformLink",
               "",
               srcOutput="resetOut",
               destInput="resetIn")

    elif addTP:
        # Add a lone TPRegion if requested
        # The input width of the TP is set to the encoder width
        print "Adding TPRegion"
        g_tpRegionConfig['inputWidth'] = encoder.getWidth()
        n.addRegion("level1TP", "py.TPRegion", json.dumps(g_tpRegionConfig))

        n.link("sensor", "level1TP", "UniformLink", "")
        n.link("sensor",
               "level1TP",
               "UniformLink",
               "",
               srcOutput="resetOut",
               destInput="resetIn")

    return n
Example #40
  def testDeltaFilter(self):
    """
    data looks like:        should generate deltas
      "t"   "s"               "dt"     "ds"

      t     10                 X
      t+1s  20                 1s      10
      t+1d  50                 86399   30

    r t+1d+1s  60              X
      r+1d+3s  65              2s       5

    """
    r = RecordSensor()
    filename = findDataset("extra/qa/delta.csv")
    datasource = FileRecordStream(filename)
    r.dataSource = datasource
    n = 50
    encoder = MultiEncoder({'blah': dict(fieldname="s", type='ScalarEncoder',
                                         n=n, w=11, minval=0, maxval=100)})

    r.encoder = encoder

    # Test #1 -- no deltas
    # Make sure we get a reset when the gym changes
    resetOut = numpy.zeros((1,), dtype='float')
    sequenceIdOut = numpy.zeros((1,), dtype='float')
    dataOut = numpy.zeros((n,), dtype='float')
    sourceOut = numpy.zeros((1,), dtype='float')
    categoryOut = numpy.zeros((1,), dtype='float')

    outputs = dict(resetOut=resetOut,
                   sourceOut = sourceOut,
                   sequenceIdOut = sequenceIdOut,
                   dataOut = dataOut,
                   categoryOut = categoryOut)
    inputs = dict()
    r.verbosity=0

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24, hour=16,
                                       minute=8, second=0))
    self.assertEqual(lr['s'], 10)
    self.assertEqual(lr['_reset'], 1)
    self.assertTrue('dt' not in lr)
    self.assertTrue('ds' not in lr)

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24, hour=16,
                                       minute=8, second=1))
    self.assertEqual(lr['s'], 20)
    self.assertEqual(lr['_reset'], 0)

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25, hour=16,
                                       minute=8, second=0))
    self.assertEqual(lr['s'], 50)
    self.assertEqual(lr['_reset'], 0)

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25, hour=16,
                                       minute=8, second=1))
    self.assertEqual(lr['s'], 60)
    self.assertEqual(lr['_reset'], 1)

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25, hour=16,
                                       minute=8, second=3))
    self.assertEqual(lr['s'], 65)
    self.assertEqual(lr['_reset'], 0)

    # Add filters

    r.preEncodingFilters = [DeltaFilter("s", "ds"), DeltaFilter("t", "dt")]
    r.rewind()

    # skip first record, which has a reset

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24, hour=16,
                                       minute=8, second=1))
    self.assertEqual(lr['s'], 20)
    self.assertEqual(lr['_reset'], 1)  # this record should have a reset since
                                       # it is first of a sequence
    self.assertEqual(lr['dt'], 1)
    self.assertEqual(lr['ds'], 10)

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25, hour=16,
                                       minute=8, second=0))
    self.assertEqual(lr['s'], 50)
    self.assertEqual(lr['_reset'], 0)
    self.assertEqual(lr['dt'], 3600 * 24 - 1)
    self.assertEqual(lr['ds'], 30)

    # next reset record is skipped

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25, hour=16,
                                       minute=8, second=3))
    self.assertEqual(lr['s'], 65)
    self.assertEqual(lr['_reset'], 1)
    self.assertEqual(lr['dt'], 2)
    self.assertEqual(lr['ds'], 5)
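To make the expected behaviour above concrete, here is a small standalone sketch of the delta arithmetic the test asserts: each record gains "ds" (change in s) and "dt" (elapsed seconds since the previous record). This mimics what DeltaFilter is expected to produce for these records, not its actual implementation.

from datetime import datetime

records = [
    {'t': datetime(2011, 2, 24, 16, 8, 0), 's': 10},
    {'t': datetime(2011, 2, 24, 16, 8, 1), 's': 20},
    {'t': datetime(2011, 2, 25, 16, 8, 0), 's': 50},
]
prev = None
for rec in records:
  if prev is not None:
    rec['ds'] = rec['s'] - prev['s']
    rec['dt'] = int((rec['t'] - prev['t']).total_seconds())
  prev = rec

assert records[1]['ds'] == 10 and records[1]['dt'] == 1
assert records[2]['ds'] == 30 and records[2]['dt'] == 3600 * 24 - 1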
Example #41
0
def generateStats(filename, maxSamples=None):
  """
  Collect statistics for each of the fields in the user input data file and
  return a stats dict object.

  Parameters:
  ------------------------------------------------------------------------------
  filename:             The path and name of the data file.
  maxSamples:           Upper bound on the number of rows to be processed
  retval:               A dictionary of dictionaries. The top level keys are the
                        field names and the corresponding values are the
                        statistics collected for the individual field.
                        Example:
                        {
                          'consumption':{'min':0,'max':90,'mean':50,...},
                          'gym':{'numDistinctCategories':10,...},
                          ...
                         }


  """
  # Mapping from field type to stats collector object
  statsCollectorMapping = {'float':    FloatStatsCollector,
                           'int':      IntStatsCollector,
                           'string':   StringStatsCollector,
                           'datetime': DateTimeStatsCollector,
                           'bool':     BoolStatsCollector,
                           }

  filename = findDataset(filename)
  print "*"*40
  print "Collecting statistics for file:'%s'" % (filename,)
  dataFile = FileRecordStream(filename)

  # Initialize collector objects
  # statsCollectors list holds statsCollector objects for each field
  statsCollectors = []
  for fieldName, fieldType, fieldSpecial in dataFile.getFields():
    # Find the corresponding stats collector for each field based on field type
    # and initialize an instance
    statsCollector = \
            statsCollectorMapping[fieldType](fieldName, fieldType, fieldSpecial)
    statsCollectors.append(statsCollector)

  # Now collect the stats
  if maxSamples is None:
    maxSamples = 500000
  for _ in xrange(maxSamples):
    record = dataFile.getNextRecord()
    if record is None:
      break
    for i, value in enumerate(record):
      statsCollectors[i].addValue(value)

  # stats dict holds the statistics for each field
  stats = {}
  for statsCollector in statsCollectors:
    statsCollector.getStats(stats)

  # We don't want to include reset field in permutations
  # TODO: handle reset field in a clean way
  resetFieldIdx = dataFile.getResetFieldIdx()
  if resetFieldIdx is not None:
    resetFieldName, _, _ = dataFile.getFields()[resetFieldIdx]
    stats.pop(resetFieldName)

  if VERBOSITY > 0:
    pprint.pprint(stats)

  return stats
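A short, hedged usage sketch for the generateStats() above, using the delta.csv sample referenced in Example #40; the exact keys inside each per-field dict depend on the individual stats collectors.

stats = generateStats('extra/qa/delta.csv', maxSamples=1000)
for fieldName, fieldStats in stats.items():
  print fieldName, '->', sorted(fieldStats.keys())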
Example #42
0
def generateDataset(aggregationInfo, inputFilename, outputFilename=None):
  """Generate a dataset of aggregated values

  Parameters:
  ----------------------------------------------------------------------------
  aggregationInfo: a dictionary that contains the following entries
    - fields: a list of pairs. Each pair is a field name and an
      aggregation function (e.g. sum). The function will be used to aggregate
      multiple values during the aggregation period.

    - aggregation period: 0 or more of unit=value fields; allowed units are:
        [years months] |
        [weeks days hours minutes seconds milliseconds microseconds]
        NOTE: years and months are mutually-exclusive with the other units.
              See getEndTime() and _aggregate() for more details.
        Example1: years=1, months=6,
        Example2: hours=1, minutes=30,
        If none of the period fields are specified or if all that are specified
        have values of 0, then aggregation will be suppressed, and the given
        inputFile parameter value will be returned.

  inputFilename: filename (or relative path from NTA_DATA_PATH) of
               the input dataset
               
  outputFilename: name for the output file. If not given, a name will be
        generated based on the input filename and the aggregation params
        
  retval: Name of the generated output file. This will be the same as the input
      file name if no aggregation needed to be performed
        
  

  If the input file contained a time field, sequence id field or reset field
  that were not specified in aggregationInfo fields, those fields will be
  added automatically with the following rules:

  1. The order will be R, S, T, rest of the fields
  2. The aggregation function for all will be to pick the first: lambda x: x[0]

    Returns: the path of the aggregated data file if aggregation was performed
      (in the same directory as the given input file); if aggregation did not
      need to be performed, then the given inputFile argument value is returned.
  """



  # Create the input stream
  inputFullPath = findDataset(inputFilename)
  inputObj = FileRecordStream(inputFullPath)
  

  # Instantiate the aggregator
  aggregator = Aggregator(aggregationInfo=aggregationInfo, 
                          inputFields=inputObj.getFields())
  
  
  # Is it a null aggregation? If so, just return the input file unmodified
  if aggregator.isNullAggregation():
    return inputFullPath


  # ------------------------------------------------------------------------
  # If we were not given an output filename, create one based on the 
  #  aggregation settings
  if outputFilename is None:
    outputFilename = 'agg_%s' % \
                        os.path.splitext(os.path.basename(inputFullPath))[0]
    timePeriods = 'years months weeks days '\
                  'hours minutes seconds milliseconds microseconds'
    for k in timePeriods.split():
      if aggregationInfo.get(k, 0) > 0:
        outputFilename += '_%s_%d' % (k, aggregationInfo[k])
  
    outputFilename += '.csv'
    outputFilename = os.path.join(os.path.dirname(inputFullPath), outputFilename)



  # ------------------------------------------------------------------------
  # If some other process already started creating this file, simply 
  #   wait for it to finish and return without doing anything
  lockFilePath = outputFilename + '.please_wait'
  if os.path.isfile(outputFilename) or \
     os.path.isfile(lockFilePath):
    while os.path.isfile(lockFilePath):
      print 'Waiting for %s to be fully written by another process' % \
            lockFilePath
      time.sleep(1)
    return outputFilename


  # Create the lock file
  lockFD = open(lockFilePath, 'w')



  # -------------------------------------------------------------------------
  # Create the output stream
  outputObj = FileRecordStream(streamID=outputFilename, write=True,
                               fields=inputObj.getFields())


  # -------------------------------------------------------------------------
  # Write all aggregated records to the output
  while True:
    inRecord = inputObj.getNextRecord()

    (aggRecord, aggBookmark) = aggregator.next(inRecord, None)

    if aggRecord is None and inRecord is None:
      break

    if aggRecord is not None:
      outputObj.appendRecord(aggRecord)

  # Close the output stream and release the lock file so that other processes
  # waiting on lockFilePath do not block forever
  outputObj.close()
  lockFD.close()
  os.remove(lockFilePath)

  return outputFilename
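A hedged usage sketch for generateDataset(), following "Example2" from the docstring (hours=1, minutes=30). The dataset path and field names are illustrative and must exist in the input file; depending on the Aggregator version, a timeField entry may also be required.

aggregationInfo = dict(
  fields=[('consumption', sum),            # aggregate by summing
          ('timestamp', lambda x: x[0])],  # keep the first timestamp per bucket
  hours=1,
  minutes=30,
)
aggregatedPath = generateDataset(aggregationInfo, 'extra/gym/gym.csv')
print "Aggregated data written to:", aggregatedPath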
Example #43
0
def runHotgym():
    model = createModel()
    model.enableInference({'predictedField': 'consumption'})
    metricsManager = MetricsManager(_METRIC_SPECS, model.getFieldInfo(),
                                    model.getInferenceType())

    # The shifter will align prediction and actual values.
    shifter = InferenceShifter()

    # Keep the last WINDOW predicted and actual values for plotting.
    actHistory = deque([0.0] * WINDOW, maxlen=WINDOW)
    predHistory = deque([0.0] * WINDOW, maxlen=WINDOW)

    # Initialize the plot lines that we will update with each new record.
    actline, = plt.plot(range(WINDOW), actHistory)
    predline, = plt.plot(range(WINDOW), predHistory)
    # Set the y-axis range.
    actline.axes.set_ylim(0, 100)
    predline.axes.set_ylim(0, 100)

    with open(findDataset(_DATA_PATH)) as fin:
        reader = csv.reader(fin)
        headers = reader.next()
        reader.next()
        reader.next()
        for i, record in enumerate(reader, start=1):
            modelInput = dict(zip(headers, record))
            modelInput["consumption"] = float(modelInput["consumption"])
            modelInput["timestamp"] = datetime.datetime.strptime(
                modelInput["timestamp"], "%m/%d/%y %H:%M")
            result = model.run(modelInput)

            shifted_result = shifter.shift(result)

            # Update the trailing predicted and actual value deques.
            inference = shifted_result.inferences[
                'multiStepBestPredictions'][1]
            if inference is not None:
                actHistory.append(shifted_result.rawInput['consumption'])
                predHistory.append(inference)

            # Redraw the chart with the new data.
            actline.set_ydata(actHistory)  # update the data
            predline.set_ydata(predHistory)  # update the data
            plt.draw()
            plt.legend(('actual', 'predicted'))

            # Make sure we wait a total of SECONDS_PER_STEP seconds per iteration.
            try:
                plt.pause(SECONDS_PER_STEP)
            except:
                pass

            result.metrics = metricsManager.update(result)
            isLast = i == _NUM_RECORDS
            if i % 100 == 0 or isLast:
                _LOGGER.info(
                    "After %i records, 1-step altMAPE=%f", i,
                    result.metrics["multiStepBestPredictions:multiStep:"
                                   "errorMetric='altMAPE':steps=1:window=1000:"
                                   "field=consumption"])
            if isLast:
                break
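The plotting in runHotgym() boils down to a rolling-window pattern: fixed-length deques hold the trailing actual and predicted values, and the matplotlib line objects are updated in place. A self-contained sketch of just that pattern with synthetic data; the WINDOW and SECONDS_PER_STEP values are assumed for this sketch, not taken from the example.

import math
from collections import deque

import matplotlib.pyplot as plt

WINDOW = 100             # assumed value for this sketch
SECONDS_PER_STEP = 0.05  # assumed value for this sketch

history = deque([0.0] * WINDOW, maxlen=WINDOW)
line, = plt.plot(range(WINDOW), history)
line.axes.set_ylim(0, 100)

for i in range(500):
  history.append(50 + 40 * math.sin(i / 10.0))  # stand-in for a new record
  line.set_ydata(history)
  plt.draw()
  try:
    plt.pause(SECONDS_PER_STEP)
  except Exception:
    pass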
Example #44
0
File: utils.py Project: zacg/nupic
def findAllDatasets(datasets):
    """Find all datasets in a dataset dictionary"""
    d = dict()
    for key in datasets:
        d[key] = findDataset(datasets[key])
    return d
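A quick usage sketch for findAllDatasets(); the delta path is the one used in Example #40, while the gym path is assumed.

paths = findAllDatasets({'delta': 'extra/qa/delta.csv',
                         'gym': 'extra/gym/gym.csv'})   # gym path assumed
for name, path in paths.items():
  print name, '->', path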
Example #45
0
    def testDeltaFilter(self):
        """
    data looks like:        should generate deltas
      "t"   "s"               "dt"     "ds"

      t     10                 X
      t+1s  20                 1s      10
      t+1d  50                 86399   30

    r t+1d+1s  60              X
      r+1d+3s  65              2s       5

    """
        r = RecordSensor()
        filename = findDataset("extra/qa/delta.csv")
        datasource = FileRecordStream(filename)
        r.dataSource = datasource
        n = 50
        encoder = MultiEncoder({
            'blah':
            dict(fieldname="s",
                 type='ScalarEncoder',
                 n=n,
                 w=11,
                 minval=0,
                 maxval=100)
        })

        r.encoder = encoder

        # Test #1 -- no deltas
        # Make sure we get a reset when the gym changes
        resetOut = numpy.zeros((1, ), dtype='float')
        sequenceIdOut = numpy.zeros((1, ), dtype='float')
        dataOut = numpy.zeros((n, ), dtype='float')
        sourceOut = numpy.zeros((1, ), dtype='float')
        categoryOut = numpy.zeros((1, ), dtype='float')

        outputs = dict(resetOut=resetOut,
                       sourceOut=sourceOut,
                       sequenceIdOut=sequenceIdOut,
                       dataOut=dataOut,
                       categoryOut=categoryOut)
        inputs = dict()
        r.verbosity = 0

        r.compute(inputs, outputs)
        lr = r.lastRecord
        self.assertEqual(
            lr['t'],
            datetime(year=2011, month=2, day=24, hour=16, minute=8, second=0))
        self.assertEqual(lr['s'], 10)
        self.assertEqual(lr['_reset'], 1)
        self.assertTrue('dt' not in lr)
        self.assertTrue('ds' not in lr)

        r.compute(inputs, outputs)
        lr = r.lastRecord
        self.assertEqual(
            lr['t'],
            datetime(year=2011, month=2, day=24, hour=16, minute=8, second=1))
        self.assertEqual(lr['s'], 20)
        self.assertEqual(lr['_reset'], 0)

        r.compute(inputs, outputs)
        lr = r.lastRecord
        self.assertEqual(
            lr['t'],
            datetime(year=2011, month=2, day=25, hour=16, minute=8, second=0))
        self.assertEqual(lr['s'], 50)
        self.assertEqual(lr['_reset'], 0)

        r.compute(inputs, outputs)
        lr = r.lastRecord
        self.assertEqual(
            lr['t'],
            datetime(year=2011, month=2, day=25, hour=16, minute=8, second=1))
        self.assertEqual(lr['s'], 60)
        self.assertEqual(lr['_reset'], 1)

        r.compute(inputs, outputs)
        lr = r.lastRecord
        self.assertEqual(
            lr['t'],
            datetime(year=2011, month=2, day=25, hour=16, minute=8, second=3))
        self.assertEqual(lr['s'], 65)
        self.assertEqual(lr['_reset'], 0)

        # Add filters

        r.preEncodingFilters = [DeltaFilter("s", "ds"), DeltaFilter("t", "dt")]
        r.rewind()

        # skip first record, which has a reset

        r.compute(inputs, outputs)
        lr = r.lastRecord
        self.assertEqual(
            lr['t'],
            datetime(year=2011, month=2, day=24, hour=16, minute=8, second=1))
        self.assertEqual(lr['s'], 20)
        self.assertEqual(lr['_reset'],
                         1)  # this record should have a reset since
        # it is first of a sequence
        self.assertEqual(lr['dt'], 1)
        self.assertEqual(lr['ds'], 10)

        r.compute(inputs, outputs)
        lr = r.lastRecord
        self.assertEqual(
            lr['t'],
            datetime(year=2011, month=2, day=25, hour=16, minute=8, second=0))
        self.assertEqual(lr['s'], 50)
        self.assertEqual(lr['_reset'], 0)
        self.assertEqual(lr['dt'], 3600 * 24 - 1)
        self.assertEqual(lr['ds'], 30)

        # next reset record is skipped

        r.compute(inputs, outputs)
        lr = r.lastRecord
        self.assertEqual(
            lr['t'],
            datetime(year=2011, month=2, day=25, hour=16, minute=8, second=3))
        self.assertEqual(lr['s'], 65)
        self.assertEqual(lr['_reset'], 1)
        self.assertEqual(lr['dt'], 2)
        self.assertEqual(lr['ds'], 5)
Example #46
0
def classifyAnomaliesManually():
  """
  In this function we explicitly label specific portions of the data stream that
  we happen to know are anomalous. Any later record that matches the pattern
  will get labeled as "myAnomaly"
  """
  model = createModel()
  model.enableInference({'predictedField': 'sinx'})

  # Here we will get the classifier instance so we can add and query labels.
  classifierRegion = model._getAnomalyClassifier()
  classifierRegionPy = classifierRegion.getSelf()

  # We need to set this classification type. It is supposed to be the default
  # but is not for some reason.
  classifierRegionPy.classificationVectorType = 2

  with open(findDataset(_DATA_PATH)) as fin:
    reader = csv.reader(fin)
    headers = reader.next()
    csvWriter = csv.writer(open(_OUTPUT_PATH,"wb"))
    csvWriter.writerow(["x", "sinx", "anomaly_score", "anomalyLabel"])
    for i, record in enumerate(reader, start=1):
      modelInput = dict(zip(headers, record))
      modelInput["sinx"] = float(modelInput["sinx"])
      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']
      anomalyLabel = result.inferences['anomalyLabel']

      # Convert the anomaly label into either 0 or 1
      if anomalyLabel == "[]":
        anomalyLabel = 0
      elif anomalyLabel == "['myAnomaly']":
        anomalyLabel = 1.0
        print "Anomaly detected at record",i

      # Manually tell the classifier to learn the first few artificial
      # anomalies. From there it should catch many of the following
      # anomalies, even though the anomaly score might be low.
      if i == 2505:
        print "Adding labeled anomalies for record",i
        classifierRegion.executeCommand(["addLabel","2498","2503","myAnomaly"])
        anomalyLabel = 1.0

      if i == 2605:
        print "Adding labeled anomalies for record",i
        classifierRegion.executeCommand(["addLabel","2598","2603","myAnomaly"])
        anomalyLabel = 1.0

      if i == 2705:
        print "Adding labeled anomalies for record",i
        classifierRegion.executeCommand(["addLabel","2698","2703","myAnomaly"])
        anomalyLabel = 1.0

      csvWriter.writerow([i, modelInput["sinx"], anomalyScore, anomalyLabel])


  print "Anomaly scores have been written to",_OUTPUT_PATH
  print "The following labels were stored in the classifier:"
  labels = eval(classifierRegion.executeCommand(["getLabels"]))
  pprint.pprint(labels)
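The example above turns the classifier's getLabels() output into a dict with eval(). As a hedged alternative (my suggestion, not part of the original example), ast.literal_eval accepts only Python literals and avoids executing arbitrary code; the payload shown in the comment is illustrative, since the real shape depends on the classifier region.

import ast

def parseLabels(labelString):
  """Safely parse the string returned by executeCommand(["getLabels"])."""
  try:
    return ast.literal_eval(labelString)
  except (ValueError, SyntaxError):
    return {}

# Illustrative payload only -- the actual shape depends on the classifier:
print parseLabels("{'recordLabels': [{'ROWID': 2498, 'labels': ['myAnomaly']}]}")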
Example #47
0
        "bottomUpOut").nonzero()[0]

    # Calculate the anomaly score using the active columns
    # and previous predicted columns
    anomalyScore = computeRawAnomalyScore(activeColumns, prevPredictedColumns)

    # Write out the anomaly score along with the record number and consumption
    # value.
    consumption = sensorRegion.getOutputData("sourceOut")[0]
    writer.writerow((i, consumption, anomalyScore))

    # Store the predicted columns for the next timestep
    predictedColumns = temporalPoolerRegion.getOutputData(
        "topDownOut").nonzero()[0]
    prevPredictedColumns = copy.deepcopy(predictedColumns)

    i += 1



if __name__ == "__main__":
  trainFile = findDataset(_DATA_PATH)
  dataSource = FileRecordStream(streamID=trainFile)

  network = createNetwork(dataSource)
  outputPath = os.path.join(os.path.dirname(__file__), _OUTPUT_PATH)
  with open(outputPath, "w") as outputFile:
    writer = csv.writer(outputFile)
    print "Writing output to %s" % outputPath
    runNetwork(network, writer)
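For reference, computeRawAnomalyScore() used above boils down to the fraction of currently active columns that were not among the previously predicted columns. A minimal numpy sketch of that idea follows; it is a paraphrase for illustration, not the canonical implementation in nupic.algorithms.anomaly.

import numpy

def rawAnomalyScore(activeColumns, prevPredictedColumns):
  """Fraction of active columns that were not predicted at the previous step."""
  active = set(numpy.asarray(activeColumns).tolist())
  if not active:
    return 0.0
  predicted = set(numpy.asarray(prevPredictedColumns).tolist())
  return float(len(active - predicted)) / len(active)

# 2 of the 4 active columns were unpredicted -> score of 0.5
print rawAnomalyScore(numpy.array([1, 2, 3, 4]), numpy.array([3, 4, 5]))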
Example #48
0
            "bottomUpOut").nonzero()[0]

        # Calculate the anomaly score using the active columns
        # and previous predicted columns
        anomalyScore = computeRawAnomalyScore(activeColumns,
                                              prevPredictedColumns)

        # Write out the anomaly score along with the record number and consumption
        # value.
        consumption = sensorRegion.getOutputData("sourceOut")[0]
        writer.writerow((i, consumption, anomalyScore))

        # Store the predicted columns for the next timestep
        predictedColumns = temporalPoolerRegion.getOutputData(
            "topDownOut").nonzero()[0]
        prevPredictedColumns = copy.deepcopy(predictedColumns)

        i += 1


if __name__ == "__main__":
    trainFile = findDataset(_DATA_PATH)
    dataSource = FileRecordStream(streamID=trainFile)

    network = createNetwork(dataSource)
    outputPath = os.path.join(os.path.dirname(__file__), _OUTPUT_PATH)
    with open(outputPath, "w") as outputFile:
        writer = csv.writer(outputFile)
        print "Writing output to %s" % outputPath
        runNetwork(network, writer)
Example #49
0
def findAllDatasets(datasets):
  """Find all datasets in a dataset dictionary"""
  d = dict()
  for key in datasets:
    d[key] = findDataset(datasets[key])
  return d
Example #50
0
def generateStats(filename, statsInfo, maxSamples = None, filters=[], cache=True):
  """Generate requested statistics for a dataset and cache to a file.
  If filename is None, then don't cache to a file"""

  # Sanity checking
  if not isinstance(statsInfo, dict):
    raise RuntimeError("statsInfo must be a dict -- "
                       "found '%s' instead" % type(statsInfo))

  filename = findDataset(filename)

  if cache:
    statsFilename = getStatsFilename(filename, statsInfo, filters)
    # Use cached stats if found AND if it has the right data
    if os.path.exists(statsFilename):
      try:
        r = pickle.load(open(statsFilename, "rb"))
      except:
        # Ok to ignore errors -- we will just re-generate the file
        print "Warning: unable to load stats for %s -- " \
              "will regenerate" % filename
        r = dict()
      requestedKeys = set(statsInfo)
      availableKeys = set(r.keys())
      unavailableKeys = requestedKeys.difference(availableKeys)
      if len(unavailableKeys) == 0:
        return r
      else:
        print "generateStats: re-generating stats file %s because " \
              "keys %s are not available" % \
              (statsFilename, str(unavailableKeys))
        os.remove(statsFilename)

  print "Generating statistics for file '%s' with filters '%s'" % (filename, filters)
  sensor = RecordSensor()
  sensor.dataSource = FileRecordStream(filename)
  sensor.preEncodingFilters = filters

  # Convert collector description to collector object
  stats = []
  for field in statsInfo:
    # field = key from statsInfo
    if statsInfo[field] == "number":
      # This wants a field name e.g. consumption and the field type as the value
      statsInfo[field] = NumberStatsCollector()
    elif statsInfo[field] == "category":
      statsInfo[field] = CategoryStatsCollector()
    else:
      raise RuntimeError("Unknown stats type '%s' for field '%s'" % (statsInfo[field], field))

  # Now collect the stats
  if maxSamples is None:
    maxSamples = 500000
  for i in xrange(maxSamples):
    try:
      record = sensor.getNextRecord()
    except StopIteration:
      break
    for (name, collector) in statsInfo.items():
      collector.add(record[name])

  del sensor

  # Assemble the results and return
  r = dict()
  for (field, collector) in statsInfo.items():
    stats = collector.getStats()
    if field not in r:
      r[field] = stats
    else:
      r[field].update(stats)

  if cache:
    f = open(statsFilename, "wb")
    pickle.dump(r, f)
    f.close()
    # caller may need to know name of cached file
    r["_filename"] = statsFilename

  return r
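A hedged usage sketch for this caching variant of generateStats(); the dataset path and the 'consumption'/'gym' field names are illustrative and must exist in the file.

statsInfo = {'consumption': 'number', 'gym': 'category'}
stats = generateStats('extra/gym/gym.csv', statsInfo, maxSamples=1000)
print "consumption stats:", stats['consumption']
print "cached stats file:", stats.get('_filename')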
Example #51
0
def runGeospatialAnomaly(dataPath, outputPath,
                         scale=False,
                         autoSequence=True,
                         useTimeEncoders=False,
                         verbose=False):

  model = createModel(useTimeEncoders, scale, verbose)

  with open(findDataset(dataPath)) as fin:
    reader = csv.reader(fin)
    csvWriter = csv.writer(open(outputPath,"wb"))
    csvWriter.writerow(["timestamp",
                       "longitude",
                       "latitude",
                       "speed",
                       "anomaly_score",
                       "new_sequence"])

    reader.next()
    reader.next()
    reader.next()

    lastTimestamp = None
    lastTrackName = None

    for _, record in enumerate(reader, start=1):
      trackName = record[0]
      timestamp = datetime.datetime.fromtimestamp(int(record[1]) / 1e3)
      longitude = float(record[2])
      latitude = float(record[3])
      speed = float(record[5])
      accuracy = float(record[7])

      if accuracy > ACCURACY_THRESHOLD:
        continue

      newSequence = False
      # Handle the automatic sequence creation
      if autoSequence:
        if lastTimestamp and (
          (timestamp - lastTimestamp).total_seconds() > INTERVAL_THRESHOLD):
          newSequence = True
      # Manual sequence resets depend on the track name
      else:
        if trackName != lastTrackName:
          newSequence = True

      lastTimestamp = timestamp
      lastTrackName = trackName

      if newSequence:
        if verbose:
          print "Starting new sequence..."
        model.resetSequenceStates()

      modelInput = {
        "vector": (longitude, latitude, speed)
      }

      if useTimeEncoders:
        modelInput["timestamp"] = timestamp

      result = model.run(modelInput)
      anomalyScore = result.inferences['anomalyScore']

      csvWriter.writerow([timestamp,
                          longitude,
                          latitude,
                          speed,
                          anomalyScore,
                          1 if newSequence else 0])
      if verbose:
        print "[{0}] - Anomaly score: {1}.".format(timestamp, anomalyScore)

  print "Anomaly scores have been written to {0}".format(outputPath)