Example #1
 def testCopyOneRow(self):
   expectedOutput = ("Timestamp,Value\n"
                     "datetime,int\n"
                     "T,\n"
                     "2011-09-04 02:00:00.000000,1\n"
                     "2011-09-04 02:05:00.000000,2\n"
                     "2011-09-04 02:10:00.000000,2\n"
                     "2011-09-04 02:15:00.000000,3\n"
                     "2011-09-04 02:20:00.000000,4\n"
                     "2011-09-04 02:25:00.000000,5\n"
                     "2011-09-04 02:30:00.000000,6\n")
   mockInput = MagicMock(return_value=StringIO(self.sampleInput))
   output = StringIO()
   mockOutput = MagicMock(return_value=output)
   with patch("__builtin__.open", mockInput):
     inputFile = FileRecordStream("input_path")
     with patch("__builtin__.open", mockOutput):
       outputFile = FileRecordStream("output_path",
                                     fields=inputFile.getFields(),
                                     write=True)
       anomalyzer.copy(inputFile, outputFile, 1, 1, 1)
   result = output.getvalue()
   result = result.replace("\r\n", "\n")
   result = result.replace("\r", "\n")
   self.assertSequenceEqual(expectedOutput, result)
Example #2
 def testCopyOneRow(self):
     expectedOutput = ("Timestamp,Value\n"
                       "datetime,int\n"
                       "T,\n"
                       "2011-09-04 02:00:00.000000,1\n"
                       "2011-09-04 02:05:00.000000,2\n"
                       "2011-09-04 02:10:00.000000,2\n"
                       "2011-09-04 02:15:00.000000,3\n"
                       "2011-09-04 02:20:00.000000,4\n"
                       "2011-09-04 02:25:00.000000,5\n"
                       "2011-09-04 02:30:00.000000,6\n")
     mockInput = MagicMock(return_value=StringIO(self.sampleInput))
     output = StringIO()
     mockOutput = MagicMock(return_value=output)
     with patch("__builtin__.open", mockInput):
         inputFile = FileRecordStream("input_path")
         with patch("__builtin__.open", mockOutput):
             outputFile = FileRecordStream("output_path",
                                           fields=inputFile.getFields(),
                                           write=True)
             anomalyzer.copy(inputFile, outputFile, 1, 1, 1)
     result = output.getvalue()
     result = result.replace("\r\n", "\n")
     result = result.replace("\r", "\n")
     self.assertSequenceEqual(expectedOutput, result)
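The two examples above route FileRecordStream's file I/O through in-memory StringIO buffers by patching the built-in open. Below is a minimal sketch of that same pattern, assuming Python 2 with the mock package and FileRecordStream from NuPIC; the sample CSV content and the path names are illustrative only, not taken from the test fixture.

# A minimal sketch of the patched-open pattern used in the tests above.
# Assumes Python 2, the mock package, and FileRecordStream from NuPIC.
from StringIO import StringIO
from mock import MagicMock, patch
from nupic.data.file_record_stream import FileRecordStream

sampleInput = ("Timestamp,Value\n"                 # field names
               "datetime,int\n"                    # field types
               "T,\n"                              # special flags (T = timestamp)
               "2011-09-04 02:00:00.000000,1\n")   # one data row

mockInput = MagicMock(return_value=StringIO(sampleInput))
output = StringIO()
mockOutput = MagicMock(return_value=output)

with patch("__builtin__.open", mockInput):
  inputFile = FileRecordStream("input_path")       # reads the in-memory CSV
  with patch("__builtin__.open", mockOutput):
    outputFile = FileRecordStream("output_path",
                                  fields=inputFile.getFields(),
                                  write=True)
    outputFile.appendRecord(inputFile.getNextRecord())  # copy one record

print output.getvalue()  # CSV text, including the three header rows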
Example #3
 def testSample(self):
   mockInput = MagicMock(return_value=StringIO(self.sampleInput))
   output = StringIO()
   mockOutput = MagicMock(return_value=output)
   with patch("__builtin__.open", mockInput):
     inputFile = FileRecordStream("input_path")
     with patch("__builtin__.open", mockOutput):
       outputFile = FileRecordStream("output_path",
                                     fields=inputFile.getFields(),
                                     write=True)
       anomalyzer.sample(inputFile, outputFile, 1)
   result = StringIO(output.getvalue())
   result.next()
   result.next()
   result.next()
   reader = csv.reader(result)
   _, value = reader.next()
   self.assertIn(int(value), (1, 2, 3, 4, 5, 6))
   self.assertRaises(StopIteration, result.next)
Example #4
 def testSample(self):
     mockInput = MagicMock(return_value=StringIO(self.sampleInput))
     output = StringIO()
     mockOutput = MagicMock(return_value=output)
     with patch("__builtin__.open", mockInput):
         inputFile = FileRecordStream("input_path")
         with patch("__builtin__.open", mockOutput):
             outputFile = FileRecordStream("output_path",
                                           fields=inputFile.getFields(),
                                           write=True)
             anomalyzer.sample(inputFile, outputFile, 1)
     result = StringIO(output.getvalue())
     result.next()
     result.next()
     result.next()
     reader = csv.reader(result)
     _, value = reader.next()
     self.assertIn(int(value), (1, 2, 3, 4, 5, 6))
     self.assertRaises(StopIteration, result.next)
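The three result.next() calls above skip the three header rows that FileRecordStream writes (field names, field types, special flags) before the data rows start. The sketch below, with made-up CSV content, shows the same parsing step with the stdlib csv module.

# Hedged sketch: parsing FileRecordStream-style CSV output (Python 2).
# The CSV text below is illustrative only.
import csv
from StringIO import StringIO

text = ("Timestamp,Value\n"
        "datetime,int\n"
        "T,\n"
        "2011-09-04 02:00:00.000000,4\n")

stream = StringIO(text)
for _ in range(3):          # skip field names, field types, special flags
  stream.next()
reader = csv.reader(stream)
for timestamp, value in reader:
  print timestamp, int(value)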
Example #5
  def _testSamePredictions(self, experiment, predSteps, checkpointAt,
                           predictionsFilename, additionalFields=None,
                           newSerialization=False):
    """ Test that we get the same predictions out from the following two
    scenarios:

    a_plus_b: Run the network for 'a' iterations followed by 'b' iterations
    a, followed by b: Run the network for 'a' iterations, save it, load it
                      back in, then run for 'b' iterations.

    Parameters:
    -----------------------------------------------------------------------
    experiment:   base directory of the experiment. This directory should
                    contain the following:
                        base.py
                        a_plus_b/description.py
                        a/description.py
                        b/description.py
                    The sub-directory description files should import the
                    base.py and only change the first and last record used
                    from the data file.
    predSteps:   Number of steps ahead predictions are for
    checkpointAt: Number of iterations that 'a' runs for.
                 IMPORTANT: This must match the number of records that
                 a/description.py runs for - it is NOT dynamically stuffed into
                 the a/description.py.
    predictionsFilename: The name of the predictions file that the OPF
                  generates for this experiment (for example
                  'DefaultTask.NontemporalMultiStep.predictionLog.csv')
    newSerialization: Whether to use new capnproto serialization.
    """

    # Get the 3 sub-experiment directories
    aPlusBExpDir = os.path.join(_EXPERIMENT_BASE, experiment, "a_plus_b")
    aExpDir = os.path.join(_EXPERIMENT_BASE, experiment, "a")
    bExpDir = os.path.join(_EXPERIMENT_BASE, experiment, "b")

    # Run a+b
    args = self._createExperimentArgs(aPlusBExpDir,
                                      newSerialization=newSerialization)
    _aPlusBExp = runExperiment(args)

    # Run a, then copy the saved checkpoint into the b directory
    args = self._createExperimentArgs(aExpDir,
                                      newSerialization=newSerialization)
    _aExp = runExperiment(args)
    if os.path.exists(os.path.join(bExpDir, 'savedmodels')):
      shutil.rmtree(os.path.join(bExpDir, 'savedmodels'))
    shutil.copytree(src=os.path.join(aExpDir, 'savedmodels'),
                    dst=os.path.join(bExpDir, 'savedmodels'))

    args = self._createExperimentArgs(bExpDir,
                                      newSerialization=newSerialization,
                                      additionalArgs=['--load=DefaultTask'])
    _bExp = runExperiment(args)

    # Now, compare the predictions at the end of a+b to those in b.
    aPlusBPred = FileRecordStream(os.path.join(aPlusBExpDir, 'inference',
                                   predictionsFilename))
    bPred = FileRecordStream(os.path.join(bExpDir, 'inference',
                                   predictionsFilename))

    colNames = [x[0] for x in aPlusBPred.getFields()]
    actValueColIdx = colNames.index('multiStepPredictions.actual')
    predValueColIdx = colNames.index('multiStepPredictions.%d' % (predSteps))

    # Skip past the 'a' records in aPlusB
    for i in range(checkpointAt):
      aPlusBPred.next()

    # Now, read through the records that don't have predictions yet
    for i in range(predSteps):
      aPlusBPred.next()
      bPred.next()

    # Now, compare predictions in the two files
    rowIdx = checkpointAt + predSteps + 4 - 1
    epsilon = 0.0001
    while True:
      rowIdx += 1
      try:
        rowAPB = aPlusBPred.next()
        rowB = bPred.next()

        # Compare actuals
        self.assertEqual(rowAPB[actValueColIdx], rowB[actValueColIdx],
              "Mismatch in actual values: row %d of a+b has %s and row %d of "
              "b has %s" % (rowIdx, rowAPB[actValueColIdx], rowIdx-checkpointAt,
                            rowB[actValueColIdx]))

        # Compare predictions, within nearest epsilon
        predAPB = eval(rowAPB[predValueColIdx])
        predB = eval(rowB[predValueColIdx])

        # Sort with highest probabilities first
        predAPB = [(a, b) for b, a in predAPB.items()]
        predB = [(a, b) for b, a in predB.items()]
        predAPB.sort(reverse=True)
        predB.sort(reverse=True)

        if additionalFields is not None:
          for additionalField in additionalFields:
            fieldIdx = colNames.index(additionalField)
            self.assertEqual(rowAPB[fieldIdx], rowB[fieldIdx],
              "Mismatch in field \'%s\' values: row %d of a+b has value: (%s)\n"
              " and row %d of b has value: %s" % \
              (additionalField, rowIdx, rowAPB[fieldIdx],
                rowIdx-checkpointAt, rowB[fieldIdx]))

        self.assertEqual(len(predAPB), len(predB),
              "Mismatch in predicted values: row %d of a+b has %d predictions: "
              "\n  (%s) and row %d of b has %d predictions:\n  (%s)" % \
              (rowIdx, len(predAPB), predAPB, rowIdx-checkpointAt, len(predB),
               predB))

        for i in range(len(predAPB)):
          (aProb, aValue) = predAPB[i]
          (bProb, bValue) = predB[i]
          self.assertLess(abs(aValue-bValue), epsilon,
              "Mismatch in predicted values: row %d of a+b predicts value %s "
              "and row %d of b predicts %s" % (rowIdx, aValue,
                                               rowIdx-checkpointAt, bValue))
          self.assertLess(abs(aProb-bProb), epsilon,
              "Mismatch in probabilities: row %d of a+b predicts %s with "
              "probability %s and row %d of b predicts %s with probability %s" \
               % (rowIdx, aValue, aProb, rowIdx-checkpointAt, bValue, bProb))

      except StopIteration:
        break

    # clean up model checkpoint directories
    shutil.rmtree(getCheckpointParentDir(aExpDir))
    shutil.rmtree(getCheckpointParentDir(bExpDir))
    shutil.rmtree(getCheckpointParentDir(aPlusBExpDir))

    print "Predictions match!"
Example #6
  def testExperimentResults(self):
    """Run specific experiments and verify that they are producing the correct
    results.

    opfDir is the examples/opf directory in the install path
    and is used to find run_opf_experiment.py

    The testdir is the directory that contains the experiments we will be
    running. When running in the auto-build setup, this will be a temporary
    directory that has had this script, as well as the specific experiments
    we will be running, copied into it by the qa/autotest/prediction_results.py
    script.
    When running stand-alone from the command line, this will point to the
    examples/prediction directory in the install tree (same as predictionDir)

    """

    nupic_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             "..", "..", "..", "..")

    opfDir = os.path.join(nupic_dir, "examples", "opf")

    testDir = opfDir

    # The testdir is the directory that contains the experiments we will be
    #  running. When running in the auto-build setup, this will be a temporary
    #  directory that has had this script, as well as the specific experiments
    #  we will be running, copied into it by the
    #  qa/autotest/prediction_results.py script.
    # When running stand-alone from the command line, we can simply point to the
    #  examples/prediction directory in the install tree.
    if not os.path.exists(os.path.join(testDir, "experiments/classification")):
      testDir = opfDir

    # Generate any dynamically generated datasets now
    command = ['python', os.path.join(testDir, 'experiments', 'classification',
                                       'makeDatasets.py')]
    retval = call(command)
    self.assertEqual(retval, 0)


    # Generate any dynamically generated datasets now
    command = ['python', os.path.join(testDir, 'experiments', 'multistep',
                                       'make_datasets.py')]
    retval = call(command)
    self.assertEqual(retval, 0)


    # Generate any dynamically generated datasets now
    command = ['python', os.path.join(testDir, 'experiments',
                                'spatial_classification', 'make_datasets.py')]
    retval = call(command)
    self.assertEqual(retval, 0)


    # Run from the test directory so that we can find our experiments
    os.chdir(testDir)

    runExperiment = os.path.join(nupic_dir, "scripts", "run_opf_experiment.py")

    # A list of experiments to run.  Valid attributes:
    #   experimentDir - Required, path to the experiment directory containing
    #                       description.py
    #   args          - optional. List of arguments for run_opf_experiment
    #   results       - A dictionary of expected results. The keys are tuples
    #                    containing (predictionLogFileName, columnName). The
    #                    value is a (min, max) expected value from the last row
    #                    in the prediction log.
    multistepTests = [
      # For this one, in theory the error for 1 step should be < 0.20
      { 'experimentDir': 'experiments/multistep/simple_0',
        'results': {
          ('DefaultTask.TemporalMultiStep.predictionLog.csv',
           "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
                    (0.0, 0.20),
        }
      },

      # For this one, in theory the error for 1 step should be < 0.50, but we
      #  get slightly higher because our sample size is smaller than ideal
      { 'experimentDir': 'experiments/multistep/simple_0_f2',
        'results': {
          ('DefaultTask.TemporalMultiStep.predictionLog.csv',
           "multiStepBestPredictions:multiStep:errorMetric='aae':steps=1:window=200:field=field2"):
                    (0.0, 0.66),
        }
      },

      # For this one, in theory the error for 1 step should be < 0.20
      { 'experimentDir': 'experiments/multistep/simple_1',
        'results': {
          ('DefaultTask.TemporalMultiStep.predictionLog.csv',
           "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
                    (0.0, 0.20),
        }
      },

      # For this test, we haven't figured out the theoretical error, this
      #  error is determined empirically from actual results
      { 'experimentDir': 'experiments/multistep/simple_1_f2',
        'results': {
          ('DefaultTask.TemporalMultiStep.predictionLog.csv',
           "multiStepBestPredictions:multiStep:errorMetric='aae':steps=1:window=200:field=field2"):
                    (0.0, 3.76),
        }
      },

      # For this one, in theory the error for 1 step should be < 0.20, but we
      #  get slightly higher because our sample size is smaller than ideal
      { 'experimentDir': 'experiments/multistep/simple_2',
        'results': {
          ('DefaultTask.TemporalMultiStep.predictionLog.csv',
           "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
                    (0.0, 0.31),
        }
      },

      # For this one, in theory the error for 1 step should be < 0.10 and for
      #  3 step < 0.30, but our actual results are better.
      { 'experimentDir': 'experiments/multistep/simple_3',
        'results': {
          ('DefaultTask.TemporalMultiStep.predictionLog.csv',
           "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
                    (0.0, 0.06),
          ('DefaultTask.TemporalMultiStep.predictionLog.csv',
           "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=3:window=200:field=field1"):
                    (0.0, 0.20),
        }
      },

      # For this test, we haven't figured out the theoretical error, this
      #  error is determined empirically from actual results
      { 'experimentDir': 'experiments/multistep/simple_3_f2',
        'results': {
          ('DefaultTask.TemporalMultiStep.predictionLog.csv',
           "multiStepBestPredictions:multiStep:errorMetric='aae':steps=1:window=200:field=field2"):
                    (0.0, 0.6),
          ('DefaultTask.TemporalMultiStep.predictionLog.csv',
           "multiStepBestPredictions:multiStep:errorMetric='aae':steps=3:window=200:field=field2"):
                    (0.0, 1.8),
        }
      },

      # Test missing record support.
      # Should have 0 error by the end of the dataset
      { 'experimentDir': 'experiments/missing_record/simple_0',
        'results': {
          ('DefaultTask.NontemporalMultiStep.predictionLog.csv',
           "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=25:field=field1"):
                    (1.0, 1.0),
        }
      },

    ] # end of multistepTests

    classificationTests = [
      # ----------------------------------------------------------------------
      # Classification Experiments
      { 'experimentDir': 'experiments/classification/category_hub_TP_0',
        'results': {
            ('OnlineLearning.TemporalClassification.predictionLog.csv',
             'classification:avg_err:window=200'): (0.0, 0.020),
            }
      },

      { 'experimentDir': 'experiments/classification/category_TM_0',
        'results': {
            ('OnlineLearning.TemporalClassification.predictionLog.csv',
             'classification:avg_err:window=200'): (0.0, 0.045),

            ('OnlineLearning.TemporalClassification.predictionLog.csv',
             'classConfidences:neg_auc:computeEvery=10:window=200'): (-1.0, -0.98),
            }
      },

      { 'experimentDir': 'experiments/classification/category_TM_1',
        'results': {
            ('OnlineLearning.TemporalClassification.predictionLog.csv',
             'classification:avg_err:window=200'): (0.0, 0.005),
            }
      },

      { 'experimentDir': 'experiments/classification/scalar_TP_0',
        'results': {
            ('OnlineLearning.TemporalClassification.predictionLog.csv',
             'classification:avg_err:window=200'): (0.0, 0.155),

            ('OnlineLearning.TemporalClassification.predictionLog.csv',
             'classConfidences:neg_auc:computeEvery=10:window=200'): (-1.0, -0.900),
            }
      },

      { 'experimentDir': 'experiments/classification/scalar_TP_1',
        'results': {
            ('OnlineLearning.TemporalClassification.predictionLog.csv',
             'classification:avg_err:window=200'):  (0.0, 0.03),
            }
      },

    ] # End of classification tests
    
    spatialClassificationTests = [
      { 'experimentDir': 'experiments/spatial_classification/category_0',
        'results': {
            ('DefaultTask.NontemporalClassification.predictionLog.csv',
             "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=0:window=100:field=classification"): 
                    (0.0, 0.05),
            }

      },

      { 'experimentDir': 'experiments/spatial_classification/category_1',
        'results': {
            ('DefaultTask.NontemporalClassification.predictionLog.csv',
             "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=0:window=100:field=classification"): 
                    (0.0, 0.0),
            }
      },
      
      { 'experimentDir': 'experiments/spatial_classification/scalar_0',
        'results': {
            ('DefaultTask.NontemporalClassification.predictionLog.csv',
             "multiStepBestPredictions:multiStep:errorMetric='aae':steps=0:window=100:field=classification"): 
                    (0.0, 0.025),
            }
      },

      { 'experimentDir': 'experiments/spatial_classification/scalar_1',
        'results': {
            ('DefaultTask.NontemporalClassification.predictionLog.csv',
             "multiStepBestPredictions:multiStep:errorMetric='aae':steps=0:window=100:field=classification"): 
                    (-1e-10, 0.01),
            }
      },


    ]

    anomalyTests = [
      # ----------------------------------------------------------------------
      # Anomaly Experiments
      { 'experimentDir': 'experiments/anomaly/temporal/simple',
        'results': {
            ('DefaultTask.TemporalAnomaly.predictionLog.csv',
             'anomalyScore:passThruPrediction:window=1000:field=f'): (0.02,
                                                                      0.04),
          }
      },



    ] # End of anomaly tests

    tests = []
    tests += multistepTests
    tests += classificationTests
    tests += spatialClassificationTests
    tests += anomalyTests

    # Uncomment this to only run a specific experiment(s)
    #tests = tests[7:8]

    # This contains a list of tuples: (expDir, key, results)
    summaryOfResults = []
    startTime = time.time()

    testIdx = -1
    for test in tests:
      testIdx += 1
      expDirectory = test['experimentDir']

      # -------------------------------------------------------------------
      # Remove files/directories generated by previous tests:
      toDelete = []

      # Remove inference results
      path = os.path.join(expDirectory, "inference")
      toDelete.append(path)
      path = os.path.join(expDirectory, "savedmodels")
      toDelete.append(path)

      for path in toDelete:
        if not os.path.exists(path):
          continue
        print "Removing %s ..." % path
        if os.path.isfile(path):
          os.remove(path)
        else:
          shutil.rmtree(path)


      # ------------------------------------------------------------------------
      # Run the test.
      args = test.get('args', [])
      print "Running experiment %s ..." % (expDirectory)
      command = ['python', runExperiment, expDirectory] + args
      retVal = call(command)

      # If retVal is non-zero and this was not a negative test, or if retVal is
      # zero and this is a negative test, something went wrong.
      if retVal:
        print "Details of failed test: %s" % test
        print("TestIdx %d, OPF experiment '%s' failed with return code %i." %
              (testIdx, expDirectory, retVal))
      self.assertFalse(retVal)


      # -----------------------------------------------------------------------
      # Check the results
      for (key, expValues) in test['results'].items():
        (logFilename, colName) = key

        # Open the prediction log file
        logFile = FileRecordStream(os.path.join(expDirectory, 'inference',
                                                logFilename))
        colNames = [x[0] for x in logFile.getFields()]
        if not colName in colNames:
          print "TestIdx %d: %s not one of the columns in " \
            "prediction log file. Available column names are: %s" % (testIdx,
                    colName, colNames)
        self.assertTrue(colName in colNames)
        colIndex = colNames.index(colName)

        # Read till we get to the last line
        while True:
          try:
            row = logFile.next()
          except StopIteration:
            break
        result = row[colIndex]

        # Save summary of results
        summaryOfResults.append((expDirectory, colName, result))

        print "Actual result for %s, %s:" % (expDirectory, colName), result
        print "Expected range:", expValues
        failed = (expValues[0] is not None and result < expValues[0]) \
            or (expValues[1] is not None and result > expValues[1])
        if failed:
          print ("TestIdx %d: Experiment %s failed. \nThe actual result"
             " for %s (%s) was outside the allowed range of %s" % (testIdx,
              expDirectory, colName, result, expValues))
        else:
          print "  Within expected range."
        self.assertFalse(failed)


    # =======================================================================
    # Print summary of results:
    print
    print "Summary of results in all experiments run:"
    print "========================================="
    prevExpDir = None
    for (expDir, key, results) in summaryOfResults:
      if expDir != prevExpDir:
        print
        print expDir
        prevExpDir = expDir
      print "  %s: %s" % (key, results)

    print "\nElapsed time: %.1f seconds" % (time.time() - startTime)
Example #7
    def testExperimentResults(self):
        """Run specific experiments and verify that they are producing the correct
    results.

    opfDir is the examples/opf directory in the install path
    and is used to find run_opf_experiment.py

    The testdir is the directory that contains the experiments we will be
    running. When running in the auto-build setup, this will be a temporary
    directory that has had this script, as well as the specific experiments
    we will be running, copied into it by the qa/autotest/prediction_results.py
    script.
    When running stand-alone from the command line, this will point to the
    examples/prediction directory in the install tree (same as predictionDir)

    """

        nupic_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "..", "..", "..", "..")

        opfDir = os.path.join(nupic_dir, "examples", "opf")

        testDir = opfDir

        # The testdir is the directory that contains the experiments we will be
        #  running. When running in the auto-build setup, this will be a temporary
        #  directory that has had this script, as well as the specific experiments
        #  we will be running, copied into it by the
        #  qa/autotest/prediction_results.py script.
        # When running stand-alone from the command line, we can simply point to the
        #  examples/prediction directory in the install tree.
        if not os.path.exists(
                os.path.join(testDir, "experiments/classification")):
            testDir = opfDir

        # Generate any dynamically generated datasets now
        command = [
            'python',
            os.path.join(testDir, 'experiments', 'classification',
                         'makeDatasets.py')
        ]
        retval = call(command)
        self.assertEqual(retval, 0)

        # Generate any dynamically generated datasets now
        command = [
            'python',
            os.path.join(testDir, 'experiments', 'multistep',
                         'make_datasets.py')
        ]
        retval = call(command)
        self.assertEqual(retval, 0)

        # Generate any dynamically generated datasets now
        command = [
            'python',
            os.path.join(testDir, 'experiments', 'spatial_classification',
                         'make_datasets.py')
        ]
        retval = call(command)
        self.assertEqual(retval, 0)

        # Run from the test directory so that we can find our experiments
        os.chdir(testDir)

        runExperiment = os.path.join(nupic_dir, "scripts",
                                     "run_opf_experiment.py")

        # A list of experiments to run.  Valid attributes:
        #   experimentDir - Required, path to the experiment directory containing
        #                       description.py
        #   args          - optional. List of arguments for run_opf_experiment
        #   results       - A dictionary of expected results. The keys are tuples
        #                    containing (predictionLogFileName, columnName). The
        #                    value is a (min, max) expected value from the last row
        #                    in the prediction log.
        multistepTests = [
            # For this one, in theory the error for 1 step should be < 0.20
            {
                'experimentDir': 'experiments/multistep/simple_0',
                'results': {
                    ('DefaultTask.TemporalMultiStep.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
                    (0.0, 0.20),
                }
            },

            # For this one, in theory the error for 1 step should be < 0.50, but we
            #  get slightly higher because our sample size is smaller than ideal
            {
                'experimentDir': 'experiments/multistep/simple_0_f2',
                'results': {
                    ('DefaultTask.TemporalMultiStep.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='aae':steps=1:window=200:field=field2"):
                    (0.0, 0.66),
                }
            },

            # For this one, in theory the error for 1 step should be < 0.20
            {
                'experimentDir': 'experiments/multistep/simple_1',
                'results': {
                    ('DefaultTask.TemporalMultiStep.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
                    (0.0, 0.20),
                }
            },

            # For this test, we haven't figured out the theoretical error, this
            #  error is determined empirically from actual results
            {
                'experimentDir': 'experiments/multistep/simple_1_f2',
                'results': {
                    ('DefaultTask.TemporalMultiStep.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='aae':steps=1:window=200:field=field2"):
                    (0.0, 3.76),
                }
            },

            # For this one, in theory the error for 1 step should be < 0.20, but we
            #  get slightly higher because our sample size is smaller than ideal
            {
                'experimentDir': 'experiments/multistep/simple_2',
                'results': {
                    ('DefaultTask.TemporalMultiStep.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
                    (0.0, 0.31),
                }
            },

            # For this one, in theory the error for 1 step should be < 0.10 and for
            #  3 step < 0.30, but our actual results are better.
            {
                'experimentDir': 'experiments/multistep/simple_3',
                'results': {
                    ('DefaultTask.TemporalMultiStep.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=200:field=field1"):
                    (0.0, 0.06),
                    ('DefaultTask.TemporalMultiStep.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=3:window=200:field=field1"):
                    (0.0, 0.20),
                }
            },

            # For this test, we haven't figured out the theoretical error, this
            #  error is determined empirically from actual results
            {
                'experimentDir': 'experiments/multistep/simple_3_f2',
                'results': {
                    ('DefaultTask.TemporalMultiStep.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='aae':steps=1:window=200:field=field2"):
                    (0.0, 0.6),
                    ('DefaultTask.TemporalMultiStep.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='aae':steps=3:window=200:field=field2"):
                    (0.0, 1.8),
                }
            },

            # Test missing record support.
            # Should have 0 error by the end of the dataset
            {
                'experimentDir': 'experiments/missing_record/simple_0',
                'results': {
                    ('DefaultTask.NontemporalMultiStep.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=1:window=25:field=field1"):
                    (1.0, 1.0),
                }
            },
        ]  # end of multistepTests

        classificationTests = [
            # ----------------------------------------------------------------------
            # Classification Experiments
            {
                'experimentDir':
                'experiments/classification/category_hub_TP_0',
                'results': {
                    ('OnlineLearning.TemporalClassification.predictionLog.csv', 'classification:avg_err:window=200'):
                    (0.0, 0.020),
                }
            },
            {
                'experimentDir': 'experiments/classification/category_TM_0',
                'results': {
                    ('OnlineLearning.TemporalClassification.predictionLog.csv', 'classification:avg_err:window=200'):
                    (0.0, 0.045),
                    ('OnlineLearning.TemporalClassification.predictionLog.csv', 'classConfidences:neg_auc:computeEvery=10:window=200'):
                    (-1.0, -0.98),
                }
            },
            {
                'experimentDir': 'experiments/classification/category_TM_1',
                'results': {
                    ('OnlineLearning.TemporalClassification.predictionLog.csv', 'classification:avg_err:window=200'):
                    (0.0, 0.005),
                }
            },
            {
                'experimentDir': 'experiments/classification/scalar_TP_0',
                'results': {
                    ('OnlineLearning.TemporalClassification.predictionLog.csv', 'classification:avg_err:window=200'):
                    (0.0, 0.155),
                    ('OnlineLearning.TemporalClassification.predictionLog.csv', 'classConfidences:neg_auc:computeEvery=10:window=200'):
                    (-1.0, -0.900),
                }
            },
            {
                'experimentDir': 'experiments/classification/scalar_TP_1',
                'results': {
                    ('OnlineLearning.TemporalClassification.predictionLog.csv', 'classification:avg_err:window=200'):
                    (0.0, 0.03),
                }
            },
        ]  # End of classification tests

        spatialClassificationTests = [
            {
                'experimentDir':
                'experiments/spatial_classification/category_0',
                'results': {
                    ('DefaultTask.NontemporalClassification.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=0:window=100:field=classification"):
                    (0.0, 0.05),
                }
            },
            {
                'experimentDir':
                'experiments/spatial_classification/category_1',
                'results': {
                    ('DefaultTask.NontemporalClassification.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='avg_err':steps=0:window=100:field=classification"):
                    (0.0, 0.0),
                }
            },
            {
                'experimentDir': 'experiments/spatial_classification/scalar_0',
                'results': {
                    ('DefaultTask.NontemporalClassification.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='aae':steps=0:window=100:field=classification"):
                    (0.0, 0.025),
                }
            },
            {
                'experimentDir': 'experiments/spatial_classification/scalar_1',
                'results': {
                    ('DefaultTask.NontemporalClassification.predictionLog.csv', "multiStepBestPredictions:multiStep:errorMetric='aae':steps=0:window=100:field=classification"):
                    (-1e-10, 0.01),
                }
            },
        ]

        anomalyTests = [
            # ----------------------------------------------------------------------
            # Anomaly Experiments
            {
                'experimentDir': 'experiments/anomaly/temporal/simple',
                'results': {
                    ('DefaultTask.TemporalAnomaly.predictionLog.csv', 'anomalyScore:passThruPrediction:window=1000:field=f'):
                    (0.02, 0.04),
                }
            },
        ]  # End of anomaly tests

        tests = []
        tests += multistepTests
        tests += classificationTests
        tests += spatialClassificationTests
        tests += anomalyTests

        # Uncomment this to only run a specific experiment(s)
        #tests = tests[7:8]

        # This contains a list of tuples: (expDir, key, results)
        summaryOfResults = []
        startTime = time.time()

        testIdx = -1
        for test in tests:
            testIdx += 1
            expDirectory = test['experimentDir']

            # -------------------------------------------------------------------
            # Remove files/directories generated by previous tests:
            toDelete = []

            # Remove inference results
            path = os.path.join(expDirectory, "inference")
            toDelete.append(path)
            path = os.path.join(expDirectory, "savedmodels")
            toDelete.append(path)

            for path in toDelete:
                if not os.path.exists(path):
                    continue
                print("Removing %s ..." % path)
                if os.path.isfile(path):
                    os.remove(path)
                else:
                    shutil.rmtree(path)

            # ------------------------------------------------------------------------
            # Run the test.
            args = test.get('args', [])
            print("Running experiment %s ..." % (expDirectory))
            command = ['python', runExperiment, expDirectory] + args
            retVal = call(command)

            # If retVal is non-zero and this was not a negative test, or if retVal is
            # zero and this is a negative test, something went wrong.
            if retVal:
                print("Details of failed test: %s" % test)
                print((
                    "TestIdx %d, OPF experiment '%s' failed with return code %i."
                    % (testIdx, expDirectory, retVal)))
            self.assertFalse(retVal)

            # -----------------------------------------------------------------------
            # Check the results
            for (key, expValues) in list(test['results'].items()):
                (logFilename, colName) = key

                # Open the prediction log file
                logFile = FileRecordStream(
                    os.path.join(expDirectory, 'inference', logFilename))
                colNames = [x[0] for x in logFile.getFields()]
                if not colName in colNames:
                    print("TestIdx %d: %s not one of the columns in " \
                      "prediction log file. Available column names are: %s" % (testIdx,
                              colName, colNames))
                self.assertTrue(colName in colNames)
                colIndex = colNames.index(colName)

                # Read till we get to the last line
                while True:
                    try:
                        row = next(logFile)
                    except StopIteration:
                        break
                result = row[colIndex]

                # Save summary of results
                summaryOfResults.append((expDirectory, colName, result))

                print("Actual result for %s, %s:" % (expDirectory, colName),
                      result)
                print("Expected range:", expValues)
                failed = (expValues[0] is not None and result < expValues[0]) \
                    or (expValues[1] is not None and result > expValues[1])
                if failed:
                    print((
                        "TestIdx %d: Experiment %s failed. \nThe actual result"
                        " for %s (%s) was outside the allowed range of %s" %
                        (testIdx, expDirectory, colName, result, expValues)))
                else:
                    print("  Within expected range.")
                self.assertFalse(failed)

        # =======================================================================
        # Print summary of results:
        print()
        print("Summary of results in all experiments run:")
        print("=========================================")
        prevExpDir = None
        for (expDir, key, results) in summaryOfResults:
            if expDir != prevExpDir:
                print()
                print(expDir)
                prevExpDir = expDir
            print("  %s: %s" % (key, results))

        print("\nElapsed time: %.1f seconds" % (time.time() - startTime))
Example #8
def generateStats(filename, maxSamples=None):
  """
  Collect statistics for each of the fields in the user input data file and
  return a stats dict object.

  Parameters:
  ------------------------------------------------------------------------------
  filename:             The path and name of the data file.
  maxSamples:           Upper bound on the number of rows to be processed
  retval:               A dictionary of dictionaries. The top level keys are the
                        field names and the corresponding values are the statistics
                        collected for the individual field.
                        Example:
                        {
                          'consumption':{'min':0,'max':90,'mean':50,...},
                          'gym':{'numDistinctCategories':10,...},
                          ...
                         }


  """
  # Mapping from field type to stats collector object
  statsCollectorMapping = {'float':    FloatStatsCollector,
                           'int':      IntStatsCollector,
                           'string':   StringStatsCollector,
                           'datetime': DateTimeStatsCollector,
                           'bool':     BoolStatsCollector,
                           }

  filename = resource_filename("nupic.datafiles", filename)
  print "*"*40
  print "Collecting statistics for file:'%s'" % (filename,)
  dataFile = FileRecordStream(filename)

  # Initialize collector objects
  # statsCollectors list holds statsCollector objects for each field
  statsCollectors = []
  for fieldName, fieldType, fieldSpecial in dataFile.getFields():
    # Find the corresponding stats collector for each field based on field type
    # and initialize an instance
    statsCollector = \
            statsCollectorMapping[fieldType](fieldName, fieldType, fieldSpecial)
    statsCollectors.append(statsCollector)

  # Now collect the stats
  if maxSamples is None:
    maxSamples = 500000
  for i in xrange(maxSamples):
    record = dataFile.getNextRecord()
    if record is None:
      break
    for i, value in enumerate(record):
      statsCollectors[i].addValue(value)

  # stats dict holds the statistics for each field
  stats = {}
  for statsCollector in statsCollectors:
    statsCollector.getStats(stats)

  # We don't want to include reset field in permutations
  # TODO: handle reset field in a clean way
  if dataFile.getResetFieldIdx() is not None:
    resetFieldName, _, _ = dataFile.getFields()[dataFile.getResetFieldIdx()]
    stats.pop(resetFieldName)

  if VERBOSITY > 0:
    pprint.pprint(stats)

  return stats
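A possible call of generateStats(), following its docstring; the data file path below is an assumption and not taken from this example.

# Hedged usage sketch for generateStats(); the path is hypothetical.
stats = generateStats("extra/hotgym/hotgym.csv", maxSamples=1000)

# stats is a dict of per-field dicts, e.g. (values illustrative):
#   {'consumption': {'min': 0, 'max': 90, 'mean': 50, ...},
#    'gym':         {'numDistinctCategories': 10, ...}}
for fieldName, fieldStats in stats.items():
  print fieldName, fieldStats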
Example #9
def generateDataset(aggregationInfo, inputFilename, outputFilename=None):
  """Generate a dataset of aggregated values

  Parameters:
  ----------------------------------------------------------------------------
  aggregationInfo: a dictionary that contains the following entries
    - fields: a list of pairs. Each pair is a field name and an
      aggregation function (e.g. sum). The function will be used to aggregate
      multiple values during the aggregation period.

    - aggregation period: 0 or more of unit=value fields; allowed units are:
        [years months] |
        [weeks days hours minutes seconds milliseconds microseconds]
        NOTE: years and months are mutually-exclusive with the other units.
              See getEndTime() and _aggregate() for more details.
        Example1: years=1, months=6,
        Example2: hours=1, minutes=30,
        If none of the period fields are specified or if all that are specified
        have values of 0, then aggregation will be suppressed, and the given
        inputFile parameter value will be returned.

  inputFilename: filename of the input dataset within examples/prediction/data

  outputFilename: name for the output file. If not given, a name will be
        generated based on the input filename and the aggregation params

  retval: Name of the generated output file. This will be the same as the input
      file name if no aggregation needed to be performed



  If the input file contained a time field, sequence id field or reset field
  that were not specified in aggregationInfo fields, those fields will be
  added automatically with the following rules:

  1. The order will be R, S, T, rest of the fields
  2. The aggregation function for all will be to pick the first: lambda x: x[0]

    Returns: the path of the aggregated data file if aggregation was performed
      (in the same directory as the given input file); if aggregation did not
      need to be performed, then the given inputFile argument value is returned.
  """



  # Create the input stream
  inputFullPath = resource_filename("nupic.datafiles", inputFilename)
  inputObj = FileRecordStream(inputFullPath)


  # Instantiate the aggregator
  aggregator = Aggregator(aggregationInfo=aggregationInfo,
                          inputFields=inputObj.getFields())


  # Is it a null aggregation? If so, just return the input file unmodified
  if aggregator.isNullAggregation():
    return inputFullPath


  # ------------------------------------------------------------------------
  # If we were not given an output filename, create one based on the
  #  aggregation settings
  if outputFilename is None:
    outputFilename = 'agg_%s' % \
                        os.path.splitext(os.path.basename(inputFullPath))[0]
    timePeriods = 'years months weeks days '\
                  'hours minutes seconds milliseconds microseconds'
    for k in timePeriods.split():
      if aggregationInfo.get(k, 0) > 0:
        outputFilename += '_%s_%d' % (k, aggregationInfo[k])

    outputFilename += '.csv'
    outputFilename = os.path.join(os.path.dirname(inputFullPath), outputFilename)



  # ------------------------------------------------------------------------
  # If some other process already started creating this file, simply
  #   wait for it to finish and return without doing anything
  lockFilePath = outputFilename + '.please_wait'
  if os.path.isfile(outputFilename) or \
     os.path.isfile(lockFilePath):
    while os.path.isfile(lockFilePath):
      print('Waiting for %s to be fully written by another process' % \
            lockFilePath)
      time.sleep(1)
    return outputFilename


  # Create the lock file
  lockFD = open(lockFilePath, 'w')



  # -------------------------------------------------------------------------
  # Create the output stream
  outputObj = FileRecordStream(streamID=outputFilename, write=True,
                               fields=inputObj.getFields())


  # -------------------------------------------------------------------------
  # Write all aggregated records to the output
  while True:
    inRecord = inputObj.getNextRecord()

    (aggRecord, aggBookmark) = aggregator.next(inRecord, None)

    if aggRecord is None and inRecord is None:
      break

    if aggRecord is not None:
      outputObj.appendRecord(aggRecord)

  return outputFilename
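A possible call of generateDataset(), based on its docstring: 'fields' holds (fieldName, aggregationFunction) pairs and the remaining keys give the aggregation period. The field names, input file, and period below are assumptions for illustration.

# Hedged usage sketch for generateDataset(); values are illustrative only.
aggregationInfo = {
    'fields': [('consumption', sum),              # sum values over each period
               ('timestamp', lambda x: x[0])],    # keep the period's first timestamp
    'hours': 1,
    'minutes': 30,
}

outputPath = generateDataset(aggregationInfo,
                             inputFilename="extra/hotgym/hotgym.csv")
print outputPath   # e.g. .../agg_hotgym_hours_1_minutes_30.csv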
Example #10
def generateDataset(aggregationInfo, inputFilename, outputFilename=None):
  """Generate a dataset of aggregated values

  Parameters:
  ----------------------------------------------------------------------------
  aggregationInfo: a dictionary that contains the following entries
    - fields: a list of pairs. Each pair is a field name and an
      aggregation function (e.g. sum). The function will be used to aggregate
      multiple values during the aggregation period.

    - aggregation period: 0 or more of unit=value fields; allowed units are:
        [years months] |
        [weeks days hours minutes seconds milliseconds microseconds]
        NOTE: years and months are mutually-exclusive with the other units.
              See getEndTime() and _aggregate() for more details.
        Example1: years=1, months=6,
        Example2: hours=1, minutes=30,
        If none of the period fields are specified or if all that are specified
        have values of 0, then aggregation will be suppressed, and the given
        inputFile parameter value will be returned.

  inputFilename: filename (or relative path from NTA_DATA_PATH) of
               the input dataset
               
  outputFilename: name for the output file. If not given, a name will be
        generated based on the input filename and the aggregation params
        
  retval: Name of the generated output file. This will be the same as the input
      file name if no aggregation needed to be performed
        
  

  If the input file contained a time field, sequence id field or reset field
  that were not specified in aggregationInfo fields, those fields will be
  added automatically with the following rules:

  1. The order will be R, S, T, rest of the fields
  2. The aggregation function for all will be to pick the first: lambda x: x[0]

    Returns: the path of the aggregated data file if aggregation was performed
      (in the same directory as the given input file); if aggregation did not
      need to be performed, then the given inputFile argument value is returned.
  """



  # Create the input stream
  inputFullPath = findDataset(inputFilename)
  inputObj = FileRecordStream(inputFullPath)
  

  # Instantiate the aggregator
  aggregator = Aggregator(aggregationInfo=aggregationInfo, 
                          inputFields=inputObj.getFields())
  
  
  # Is it a null aggregation? If so, just return the input file unmodified
  if aggregator.isNullAggregation():
    return inputFullPath


  # ------------------------------------------------------------------------
  # If we were not given an output filename, create one based on the 
  #  aggregation settings
  if outputFilename is None:
    outputFilename = 'agg_%s' % \
                        os.path.splitext(os.path.basename(inputFullPath))[0]
    timePeriods = 'years months weeks days '\
                  'hours minutes seconds milliseconds microseconds'
    for k in timePeriods.split():
      if aggregationInfo.get(k, 0) > 0:
        outputFilename += '_%s_%d' % (k, aggregationInfo[k])
  
    outputFilename += '.csv'
    outputFilename = os.path.join(os.path.dirname(inputFullPath), outputFilename)



  # ------------------------------------------------------------------------
  # If some other process already started creating this file, simply 
  #   wait for it to finish and return without doing anything
  lockFilePath = outputFilename + '.please_wait'
  if os.path.isfile(outputFilename) or \
     os.path.isfile(lockFilePath):
    while os.path.isfile(lockFilePath):
      print 'Waiting for %s to be fully written by another process' % \
            lockFilePath
      time.sleep(1)
    return outputFilename


  # Create the lock file
  lockFD = open(lockFilePath, 'w')



  # -------------------------------------------------------------------------
  # Create the output stream
  outputObj = FileRecordStream(streamID=outputFilename, write=True,
                               fields=inputObj.getFields())


  # -------------------------------------------------------------------------
  # Write all aggregated records to the output
  while True:
    inRecord = inputObj.getNextRecord()
    
    (aggRecord, aggBookmark) = aggregator.next(inRecord, None)
    
    if aggRecord is None and inRecord is None:
      break
    
    if aggRecord is not None:
      outputObj.appendRecord(aggRecord)

  return outputFilename
Example #11
    def testMissingValues(self):

        print "Beginning Missing Data test..."
        filename = _getTempFileName()

        # Some values missing of each type
        # read dataset from disk, retrieve values
        # string should return empty string, numeric types sentinelValue

        print 'Creating tempfile:', filename

        # write dataset to disk with float, int, and string fields
        fields = [
            FieldMetaInfo('timestamp', FieldMetaType.datetime,
                          FieldMetaSpecial.timestamp),
            FieldMetaInfo('name', FieldMetaType.string, FieldMetaSpecial.none),
            FieldMetaInfo('integer', FieldMetaType.integer,
                          FieldMetaSpecial.none),
            FieldMetaInfo('real', FieldMetaType.float, FieldMetaSpecial.none)
        ]
        s = FileRecordStream(streamID=filename, write=True, fields=fields)

        # Records
        records = (
            [datetime(day=1, month=3, year=2010), 'rec_1', 5, 6.5],
            [datetime(day=2, month=3, year=2010), '', 8, 7.5],
            [datetime(day=3, month=3, year=2010), 'rec_3', '', 8.5],
            [datetime(day=4, month=3, year=2010), 'rec_4', 12, ''],
            [datetime(day=5, month=3, year=2010), 'rec_5', -87657496599, 6.5],
            [datetime(day=6, month=3, year=2010), 'rec_6', 12, -87657496599],
            [datetime(day=6, month=3, year=2010), str(-87657496599), 12, 6.5])

        for r in records:
            s.appendRecord(list(r))

        s.close()

        # Read the standard file
        s = FileRecordStream(streamID=filename, write=False)

        fieldsRead = s.getFields()
        self.assertEqual(fields, fieldsRead)

        recordsRead = []
        while True:
            r = s.getNextRecord()
            if r is None:
                break
            print 'Reading record ...'
            print r
            recordsRead.append(r)

        # sort the records by date, so we know for sure which is which
        recordsRead.sort(key=lambda rec: rec[0])

        # empty string
        self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[1][1])

        # missing int
        self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[2][2])

        # missing float
        self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[3][3])

        # sentinel value in input handled correctly for int field
        self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[4][2])

        # sentinel value in input handled correctly for float field
        self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[5][3])

        # sentinel value in input handled correctly for string field
        # this should leave the string as-is, since a missing string
        # is encoded not with a sentinel value but with an empty string
        self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[6][1])
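The assertions above show that missing cells of any type are read back as SENTINEL_VALUE_FOR_MISSING_DATA, while a legitimate value that merely looks like the sentinel is left alone. A small sketch of consuming such records follows; the helper is hypothetical and takes the sentinel as an argument rather than assuming its import path.

# Hedged sketch: skip cells whose value equals the missing-data sentinel.
from nupic.data.file_record_stream import FileRecordStream

def readNonMissing(path, sentinel):
  # Yield (fieldName, value) pairs, dropping cells equal to the sentinel.
  stream = FileRecordStream(path)
  names = [name for name, _, _ in stream.getFields()]
  while True:
    record = stream.getNextRecord()
    if record is None:
      break
    for name, value in zip(names, record):
      if value != sentinel:
        yield name, value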
Example #12
  def testBasic(self):
    """Runs basic FileRecordStream tests."""
    filename = _getTempFileName()

    # Write a standard file
    fields = [('name', 'string', ''),
              ('timestamp', 'datetime', 'T'),
              ('integer', 'int', ''),
              ('real', 'float', ''),
              ('reset', 'int', 'R'),
              ('sid', 'string', 'S'),
              ('categoryField', 'int', 'C'),]
    fieldNames = ['name', 'timestamp', 'integer', 'real', 'reset', 'sid',
                  'categoryField']

    print 'Creating temp file:', filename

    s = FileRecordStream(streamID=filename, write=True, fields=fields)

    self.assertTrue(s.getDataRowCount() == 0)

    # Records
    records = (
      ['rec_1', datetime(day=1, month=3, year=2010), 5, 6.5, 1, 'seq-1', 10],
      ['rec_2', datetime(day=2, month=3, year=2010), 8, 7.5, 0, 'seq-1', 11],
      ['rec_3', datetime(day=3, month=3, year=2010), 12, 8.5, 0, 'seq-1', 12])

    self.assertTrue(s.getFields() == fields)
    self.assertTrue(s.getNextRecordIdx() == 0)

    print 'Writing records ...'
    for r in records:
      print list(r)
      s.appendRecord(list(r))

    self.assertTrue(s.getDataRowCount() == 3)

    recordsBatch = (
      ['rec_4', datetime(day=4, month=3, year=2010), 2, 9.5, 1, 'seq-1', 13],
      ['rec_5', datetime(day=5, month=3, year=2010), 6, 10.5, 0, 'seq-1', 14],
      ['rec_6', datetime(day=6, month=3, year=2010), 11, 11.5, 0, 'seq-1', 15])

    print 'Adding batch of records...'
    for rec in recordsBatch:
      print rec
    s.appendRecords(recordsBatch)
    self.assertTrue(s.getDataRowCount() == 6)

    s.close()

    # Read the standard file
    s = FileRecordStream(filename)
    self.assertTrue(s.getDataRowCount() == 6)
    self.assertTrue(s.getFieldNames() == fieldNames)

    # Note! this is the number of records read so far
    self.assertTrue(s.getNextRecordIdx() == 0)

    readStats = s.getStats()
    print 'Got stats:', readStats
    expectedStats = {
                     'max': [None, None, 12, 11.5, 1, None, 15],
                     'min': [None, None, 2, 6.5, 0, None, 10]
                    }
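    # (min/max are tracked per numeric column - integer, real, reset and
    # categoryField here; string and datetime columns come back as None,
    # as the expected values above show)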
    self.assertEqual(readStats, expectedStats)

    readRecords = []
    print 'Reading records ...'
    while True:
      r = s.getNextRecord()
      print r
      if r is None:
        break

      readRecords.append(r)

    allRecords = records + recordsBatch
    for r1, r2 in zip(allRecords, readRecords):
      print 'Expected:', r1
      print 'Read    :', r2
      self.assertEqual(r1, r2)

    s.close()
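
A stripped-down version of the write/read round trip that testBasic exercises looks like the sketch below. It assumes the same Python 2 NuPIC environment as these tests, with FileRecordStream imported from nupic.data.file_record_stream as in NuPIC's source tree; the '/tmp/example.csv' path and the two-field layout are placeholders chosen for illustration, not values taken from the test.

# Minimal FileRecordStream round trip (sketch): write, close, re-open, read.
from datetime import datetime
from nupic.data.file_record_stream import FileRecordStream

fields = [('timestamp', 'datetime', 'T'),   # 'T' flags the timestamp field
          ('value', 'int', '')]

# Write two records; close() flushes the CSV header and data rows to disk.
writer = FileRecordStream(streamID='/tmp/example.csv', write=True, fields=fields)
writer.appendRecord([datetime(2010, 3, 1), 5])
writer.appendRecord([datetime(2010, 3, 2), 8])
writer.close()

# Re-open for reading; getNextRecord() returns None once the data is exhausted.
reader = FileRecordStream('/tmp/example.csv')
assert reader.getFieldNames() == ['timestamp', 'value']
while True:
    record = reader.getNextRecord()
    if record is None:
        break
    print record
reader.close()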
  def testMissingValues(self):

    print "Beginning Missing Data test..."
    filename = _getTempFileName()

    # Some values missing of each type
    # read dataset from disk, retrieve values
    # string should return an empty string, numeric types the sentinel value

    print 'Creating tempfile:', filename

    # write dataset to disk with float, int, and string fields
    fields = [('timestamp', 'datetime', 'T'),
              ('name', 'string', ''),
              ('integer', 'int', ''),
              ('real', 'float', '')]
    s = FileRecordStream(streamID=filename, write=True, fields=fields)

    # Records
    records = (
      [datetime(day=1, month=3, year=2010), 'rec_1', 5, 6.5],
      [datetime(day=2, month=3, year=2010), '', 8, 7.5],
      [datetime(day=3, month=3, year=2010), 'rec_3', '', 8.5],
      [datetime(day=4, month=3, year=2010), 'rec_4', 12, ''],
      [datetime(day=5, month=3, year=2010), 'rec_5', -87657496599, 6.5],
      [datetime(day=6, month=3, year=2010), 'rec_6', 12, -87657496599],
      [datetime(day=6, month=3, year=2010), str(-87657496599), 12, 6.5])

    for r in records:
      s.appendRecord(list(r))

    s.close()

    # Read the standard file
    s = FileRecordStream(streamID=filename, write=False)

    fieldsRead = s.getFields()
    self.assertEqual(fields, fieldsRead)

    recordsRead = []
    while True:
      r = s.getNextRecord()
      if r is None:
        break
      print 'Reading record ...'
      print r
      recordsRead.append(r)

    # sort the records by date, so we know for sure which is which
    recordsRead.sort(key=lambda rec: rec[0])

    # empty string
    self.assertTrue(recordsRead[1][1] == SENTINEL_VALUE_FOR_MISSING_DATA)

    # missing int
    self.assertTrue(recordsRead[2][2] == SENTINEL_VALUE_FOR_MISSING_DATA)

    # missing float
    self.assertTrue(recordsRead[3][3] == SENTINEL_VALUE_FOR_MISSING_DATA)

    # sentinel value in input handled correctly for int field
    self.assertTrue(recordsRead[4][2] != SENTINEL_VALUE_FOR_MISSING_DATA)

    # sentinel value in input handled correctly for float field
    self.assertTrue(recordsRead[5][3] != SENTINEL_VALUE_FOR_MISSING_DATA)

    # sentinel value in input handled correctly for string field
    # this should leave the string as-is, since a missing string
    # is encoded not with a sentinel value but with an empty string
    self.assertTrue(recordsRead[6][1] != SENTINEL_VALUE_FOR_MISSING_DATA)
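
The convention these assertions pin down, namely that a blank numeric cell comes back as SENTINEL_VALUE_FOR_MISSING_DATA while real values pass through untouched, can be reproduced with just two records. A minimal sketch, assuming the same imports as the test above (FileRecordStream, datetime and the SENTINEL_VALUE_FOR_MISSING_DATA constant, whose import line is not repeated here) and a placeholder temp path:

# Sketch: a blank numeric cell round-trips as SENTINEL_VALUE_FOR_MISSING_DATA.
# Imports are assumed to match the test module above; '/tmp/missing.csv' is a
# placeholder path.
fields = [('timestamp', 'datetime', 'T'),
          ('count', 'int', '')]

out = FileRecordStream(streamID='/tmp/missing.csv', write=True, fields=fields)
out.appendRecord([datetime(2010, 3, 1), ''])   # int cell left blank
out.appendRecord([datetime(2010, 3, 2), 7])    # int cell populated
out.close()

inp = FileRecordStream('/tmp/missing.csv')
first = inp.getNextRecord()
second = inp.getNextRecord()
inp.close()

assert first[1] == SENTINEL_VALUE_FOR_MISSING_DATA    # missing -> sentinel
assert second[1] != SENTINEL_VALUE_FOR_MISSING_DATA   # real value preserved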
Example #14
0
    def _testSamePredictions(self,
                             experiment,
                             predSteps,
                             checkpointAt,
                             predictionsFilename,
                             additionalFields=None):
        """ Test that we get the same predictions out from the following two
        scenarios:

        a_plus_b: Run the network for 'a' iterations followed by 'b' iterations
        a, followed by b: Run the network for 'a' iterations, save it, load it
                          back in, then run for 'b' iterations.

        Parameters:
        -----------------------------------------------------------------------
        experiment:   base directory of the experiment. This directory should
                        contain the following:
                            base.py
                            a_plus_b/description.py
                            a/description.py
                            b/description.py
                        The sub-directory description files should import
                        base.py and only change the first and last record used
                        from the data file.
        predSteps:   Number of steps ahead predictions are for
        checkpointAt: Number of iterations that 'a' runs for.
                     IMPORTANT: This must match the number of records that
                     a/description.py runs for - it is NOT dynamically stuffed
                     into a/description.py.
        predictionsFilename: The name of the predictions file that the OPF
                      generates for this experiment (for example
                      'DefaultTask.NontemporalMultiStep.predictionLog.csv')
        """

        # Get the 3 sub-experiment directories
        aPlusBExpDir = os.path.join(_EXPERIMENT_BASE, experiment, "a_plus_b")
        aExpDir = os.path.join(_EXPERIMENT_BASE, experiment, "a")
        bExpDir = os.path.join(_EXPERIMENT_BASE, experiment, "b")

        # Run a+b
        _aPlusBExp = runExperiment(args=[aPlusBExpDir])

        # Run a, then copy the saved checkpoint into the b directory
        _aExp = runExperiment(args=[aExpDir])
        if os.path.exists(os.path.join(bExpDir, 'savedmodels')):
            shutil.rmtree(os.path.join(bExpDir, 'savedmodels'))
        shutil.copytree(src=os.path.join(aExpDir, 'savedmodels'),
                        dst=os.path.join(bExpDir, 'savedmodels'))

        _bExp = runExperiment(args=[bExpDir, '--load=DefaultTask'])

        # Now, compare the predictions at the end of a+b to those in b.
        aPlusBPred = FileRecordStream(
            os.path.join(aPlusBExpDir, 'inference', predictionsFilename))
        bPred = FileRecordStream(
            os.path.join(bExpDir, 'inference', predictionsFilename))

        colNames = [x[0] for x in aPlusBPred.getFields()]
        actValueColIdx = colNames.index('multiStepPredictions.actual')
        predValueColIdx = colNames.index('multiStepPredictions.%d' %
                                         (predSteps))

        # Skip past the 'a' records in aPlusB
        for i in range(checkpointAt):
            aPlusBPred.next()

        # Now, read through the records that don't have predictions yet
        for i in range(predSteps):
            aPlusBPred.next()
            bPred.next()

        # Now, compare predictions in the two files
        rowIdx = checkpointAt + predSteps + 4 - 1
        epsilon = 0.0001
        while True:
            rowIdx += 1
            try:
                rowAPB = aPlusBPred.next()
                rowB = bPred.next()

                # Compare actuals
                self.assertEqual(
                    rowAPB[actValueColIdx], rowB[actValueColIdx],
                    "Mismatch in actual values: row %d of a+b has %s and row %d of "
                    "b has %s" % (rowIdx, rowAPB[actValueColIdx],
                                  rowIdx - checkpointAt, rowB[actValueColIdx]))

                # Compare predictions, within nearest epsilon
                predAPB = eval(rowAPB[predValueColIdx])
                predB = eval(rowB[predValueColIdx])

                # Sort with highest probabilities first
                predAPB = [(a, b) for b, a in predAPB.items()]
                predB = [(a, b) for b, a in predB.items()]
                predAPB.sort(reverse=True)
                predB.sort(reverse=True)

                if additionalFields is not None:
                    for additionalField in additionalFields:
                        fieldIdx = colNames.index(additionalField)
                        self.assertEqual(rowAPB[fieldIdx], rowB[fieldIdx],
                          "Mismatch in field \'%s\' values: row %d of a+b has value: (%s)\n"
                          " and row %d of b has value: %s" % \
                          (additionalField, rowIdx, rowAPB[fieldIdx],
                            rowIdx-checkpointAt, rowB[fieldIdx]))

                self.assertEqual(len(predAPB), len(predB),
                      "Mismatch in predicted values: row %d of a+b has %d predictions: "
                      "\n  (%s) and row %d of b has %d predictions:\n  (%s)" % \
                      (rowIdx, len(predAPB), predAPB, rowIdx-checkpointAt, len(predB),
                       predB))

                for i in range(len(predAPB)):
                    (aProb, aValue) = predAPB[i]
                    (bProb, bValue) = predB[i]
                    self.assertLess(
                        abs(aValue - bValue), epsilon,
                        "Mismatch in predicted values: row %d of a+b predicts value %s "
                        "and row %d of b predicts %s" %
                        (rowIdx, aValue, rowIdx - checkpointAt, bValue))
                    self.assertLess(abs(aProb-bProb), epsilon,
                        "Mismatch in probabilities: row %d of a+b predicts %s with "
                        "probability %s and row %d of b predicts %s with probability %s" \
                         % (rowIdx, aValue, aProb, rowIdx-checkpointAt, bValue, bProb))

            except StopIteration:
                break

        # clean up model checkpoint directories
        shutil.rmtree(getCheckpointParentDir(aExpDir))
        shutil.rmtree(getCheckpointParentDir(bExpDir))
        shutil.rmtree(getCheckpointParentDir(aPlusBExpDir))

        print "Predictions match!"
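
The heart of the loop above is the per-row comparison of two multi-step prediction cells, each of which holds the text repr of a {predictedValue: probability} dict. Pulled out on its own, that check might look like the sketch below; comparePredictionRows is a hypothetical helper name, and the eval-based parsing and 0.0001 tolerance mirror what the test does.

def comparePredictionRows(cellA, cellB, epsilon=0.0001):
    """Return True if two prediction-log cells agree within epsilon."""
    predsA = eval(cellA)   # each cell stores the repr of {value: probability}
    predsB = eval(cellB)
    # Re-order as (probability, value) pairs, highest probability first.
    pairsA = sorted([(p, v) for v, p in predsA.items()], reverse=True)
    pairsB = sorted([(p, v) for v, p in predsB.items()], reverse=True)
    if len(pairsA) != len(pairsB):
        return False
    for (probA, valA), (probB, valB) in zip(pairsA, pairsB):
        if abs(valA - valB) >= epsilon or abs(probA - probB) >= epsilon:
            return False
    return True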
    def testMissingValues(self):

        print "Beginning Missing Data test..."
        filename = _getTempFileName()

        # Some values missing of each type
        # read dataset from disk, retrieve values
        # string should return an empty string, numeric types the sentinel value

        print "Creating tempfile:", filename

        # write dataset to disk with float, int, and string fields
        fields = [("timestamp", "datetime", "T"),
                  ("name", "string", ""),
                  ("integer", "int", ""),
                  ("real", "float", "")]
        s = FileRecordStream(streamID=filename, write=True, fields=fields)

        # Records
        records = (
            [datetime(day=1, month=3, year=2010), "rec_1", 5, 6.5],
            [datetime(day=2, month=3, year=2010), "", 8, 7.5],
            [datetime(day=3, month=3, year=2010), "rec_3", "", 8.5],
            [datetime(day=4, month=3, year=2010), "rec_4", 12, ""],
            [datetime(day=5, month=3, year=2010), "rec_5", -87657496599, 6.5],
            [datetime(day=6, month=3, year=2010), "rec_6", 12, -87657496599],
            [datetime(day=6, month=3, year=2010), str(-87657496599), 12, 6.5],
        )

        for r in records:
            s.appendRecord(list(r))

        s.close()

        # Read the standard file
        s = FileRecordStream(streamID=filename, write=False)

        fieldsRead = s.getFields()
        self.assertEqual(fields, fieldsRead)

        recordsRead = []
        while True:
            r = s.getNextRecord()
            if r is None:
                break
            print "Reading record ..."
            print r
            recordsRead.append(r)

        # sort the records by date, so we know for sure which is which
        recordsRead.sort(key=lambda rec: rec[0])

        # empty string
        self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[1][1])

        # missing int
        self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[2][2])

        # missing float
        self.assertEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[3][3])

        # sentinel value in input handled correctly for int field
        self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[4][2])

        # sentinel value in input handled correctly for float field
        self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[5][3])

        # sentinel value in input handled correctly for string field
        # this should leave the string as-is, since a missing string
        # is encoded not with a sentinel value but with an empty string
        self.assertNotEqual(SENTINEL_VALUE_FOR_MISSING_DATA, recordsRead[6][1])