Example #1
    def test_WeightedMean(self):
        # Cleanup old files
        #for f in glob.glob('*.*'):
        #  if 'auto_specials' in f:
        #    os.remove(f)

        fields = [
            ('dummy1', 'int', ''),
            ('dummy2', 'int', ''),
            ('timestamp', 'datetime', 'T'),
        ]

        records = (
            [10, 1, datetime.datetime(2000, 3, 1)],
            [5, 2, datetime.datetime(2000, 3, 2)],
            [1, 100, datetime.datetime(2000, 3, 3)],
            [2, 4, datetime.datetime(2000, 3, 4)],
            [4, 1, datetime.datetime(2000, 3, 5)],
            [4, 0, datetime.datetime(2000, 3, 6)],
            [5, 0, datetime.datetime(2000, 3, 7)],
            [6, 0, datetime.datetime(2000, 3, 8)],
        )

        if not os.path.isdir('data'):
            os.makedirs('data')

        with FileRecordStream('data/weighted_mean.csv', write=True, fields=fields) \
              as o:
            for r in records:
                o.appendRecord(r)

        # Aggregate just the dummy field, all the specials should be added
        ai = dict(fields=[('dummy1', 'wmean:dummy2', None),
                          ('dummy2', 'mean', None)],
                  days=2)

        handle = \
          tempfile.NamedTemporaryFile(prefix='weighted_mean',
            suffix='.csv',
            dir='.')
        tempFile = handle.name
        handle.close()

        outputFile = generateDataset(ai, 'weighted_mean.csv', tempFile)

        result = []
        with FileRecordStream(outputFile) as f:
            print f.getFields()
            for r in f:
                result.append(r)

        self.assertEqual(result[0][0], 6.0)
        self.assertEqual(result[0][1], 1.0)
        self.assertEqual(result[1][0], 1.0)
        self.assertEqual(result[1][1], 52.0)
        self.assertEqual(result[2][0], 4.0)
        self.assertEqual(result[2][1], 0.0)
        self.assertEqual(result[3][0], None)
        self.assertEqual(result[3][1], 0.0)
        return
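The asserted values can be reproduced by hand from the records above: with days=2, the aggregator groups the eight records into four 2-day buckets and applies 'wmean:dummy2' to dummy1 and 'mean' to dummy2 within each bucket. Below is a minimal standalone sketch of that arithmetic; the truncation of fractional results to whole numbers is an assumption inferred from the 'int' field type and the asserted values, not something stated in the test itself.

# 2-day buckets of (dummy1 values, dummy2 weights) taken from `records` above
buckets = [([10, 5], [1, 2]),      # days 1-2
           ([1, 2], [100, 4]),     # days 3-4
           ([4, 4], [1, 0]),       # days 5-6
           ([5, 6], [0, 0])]       # days 7-8

for values, weights in buckets:
    total = float(sum(weights))
    # 'wmean:dummy2': dummy1 weighted by dummy2; undefined (None) when all weights are 0
    wmean = int(sum(v * w for v, w in zip(values, weights)) / total) if total else None
    mean = int(sum(weights) / float(len(weights)))   # 'mean' of dummy2
    print((wmean, mean))
# -> (6, 1), (1, 52), (4, 0), (None, 0): the values asserted above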
Example #2
  def test_WeightedMean(self):
    # Cleanup old files
    #for f in glob.glob('*.*'):
    #  if 'auto_specials' in f:
    #    os.remove(f)


    fields = [('dummy1', 'int', ''),
              ('dummy2', 'int', ''),
              ('timestamp', 'datetime', 'T'),
              ]

    records = (
      [10, 1, datetime.datetime(2000, 3, 1)],
      [5, 2, datetime.datetime(2000, 3, 2)],
      [1, 100, datetime.datetime(2000, 3, 3)],
      [2, 4, datetime.datetime(2000, 3, 4)],
      [4, 1, datetime.datetime(2000, 3, 5)],
      [4, 0, datetime.datetime(2000, 3, 6)],
      [5, 0, datetime.datetime(2000, 3, 7)],
      [6, 0, datetime.datetime(2000, 3, 8)],
    )

    with FileRecordStream(resource_filename('nupic.datafiles', 'weighted_mean.csv'), write=True, fields=fields) \
          as o:
      for r in records:
        o.appendRecord(r)

    # Aggregate just the dummy field, all the specials should be added
    ai = dict(
      fields=[('dummy1', 'wmean:dummy2', None),
              ('dummy2', 'mean', None)],
      days=2
      )
    
    handle = \
      tempfile.NamedTemporaryFile(prefix='weighted_mean', 
        suffix='.csv',
        dir='.')
    tempFile = handle.name
    handle.close()    

    outputFile = generateDataset(ai, 'weighted_mean.csv', tempFile)

    result = []
    with FileRecordStream(outputFile) as f:
      print f.getFields()
      for r in f:
        result.append(r)

    self.assertEqual(result[0][0], 6.0)
    self.assertEqual(result[0][1], 1.0)
    self.assertEqual(result[1][0], 1.0)
    self.assertEqual(result[1][1], 52.0)
    self.assertEqual(result[2][0], 4.0)
    self.assertEqual(result[2][1], 0.0)
    self.assertEqual(result[3][0], None)
    self.assertEqual(result[3][1], 0.0)
    return
Example #3
  def test_GenerateDataset(self):
    dataset = 'extra/gym/gym.csv'

    print "Using input dataset: ", dataset

    gymFields = None
    with FileRecordStream(findDataset(dataset)) as f:
      gymFields = f.getFieldNames()

    aggregationOptions = dict(
      timeField=gymFields.index('timestamp'),
      fields=[('attendeeCount', sum),
              ('consumption', sum),
              ('timestamp', lambda x: x[0])],

      hours=5
      )
    
    handle = \
      tempfile.NamedTemporaryFile(prefix='agg_gym_hours_5', 
        suffix='.csv', 
        dir=os.path.dirname(findDataset(dataset)))
    outputFile = handle.name
    handle.close()

    print "Expected outputFile path: ", outputFile

    print "Files in the destination folder before the test:"
    print os.listdir(os.path.abspath(os.path.dirname(findDataset(dataset))))

    if os.path.isfile(outputFile):
      print "Removing existing outputFile: ", outputFile
      os.remove(outputFile)

    self.assertFalse(os.path.exists(outputFile),
                     msg="Shouldn't exist, but does: " + str(outputFile))

    result = generateDataset(aggregationOptions, dataset, outputFile)
    print "generateDataset() returned: ", result

    f1 = os.path.abspath(os.path.normpath(result))
    print "normalized generateDataset() result path: ", f1
    f2 = os.path.normpath(outputFile)
    print "normalized outputFile path: ", f2
    self.assertEqual(f1, f2)

    print "Checking for presence of outputFile: ", outputFile
    self.assertTrue(
      os.path.isfile(outputFile),
      msg="Missing outputFile: %r; normalized generateDataset() result: %r" % (
        outputFile, f1))

    print "Files in the destination folder after the test:"
    print os.listdir(os.path.abspath(os.path.dirname(findDataset(dataset))))

    print result
    print '-' * 30

    return
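Here the fields option maps each input column to a plain Python callable that reduces the values collected inside one 5-hour window: sum adds them up, while lambda x: x[0] keeps the first value and is used for the timestamp. A small sketch of just that reduction step, outside of generateDataset; the window contents below are made up for illustration, and the grouping of rows into windows is handled by the aggregator itself.

# One hypothetical 5-hour window of gym rows: [attendeeCount, consumption, timestamp]
window = [[12, 30.5, '2010-07-02 00:00'],
          [ 9, 25.0, '2010-07-02 01:00'],
          [15, 41.2, '2010-07-02 04:00']]

# Same shape as aggregationOptions['fields']: (fieldName, reducer)
reducers = [('attendeeCount', sum),
            ('consumption', sum),
            ('timestamp', lambda x: x[0])]   # keep the first timestamp in the window

row = [fn([rec[i] for rec in window]) for i, (name, fn) in enumerate(reducers)]
# -> [36, 96.7, '2010-07-02 00:00']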
Example #4
    def test_GenerateDataset(self):
        dataset = 'extra/gym/gym.csv'

        print "Using input dataset: ", dataset

        gymFields = None
        with FileRecordStream(findDataset(dataset)) as f:
            gymFields = f.getFieldNames()

        aggregationOptions = dict(timeField=gymFields.index('timestamp'),
                                  fields=[('attendeeCount', sum),
                                          ('consumption', sum),
                                          ('timestamp', lambda x: x[0])],
                                  hours=5)

        handle = \
          tempfile.NamedTemporaryFile(prefix='agg_gym_hours_5',
            suffix='.csv',
            dir=os.path.dirname(findDataset(dataset)))
        outputFile = handle.name
        handle.close()

        print "Expected outputFile path: ", outputFile

        print "Files in the destination folder before the test:"
        print os.listdir(os.path.abspath(os.path.dirname(
            findDataset(dataset))))

        if os.path.isfile(outputFile):
            print "Removing existing outputFile: ", outputFile
            os.remove(outputFile)

        self.assertFalse(os.path.exists(outputFile),
                         msg="Shouldn't exist, but does: " + str(outputFile))

        result = generateDataset(aggregationOptions, dataset, outputFile)
        print "generateDataset() returned: ", result

        f1 = os.path.abspath(os.path.normpath(result))
        print "normalized generateDataset() result path: ", f1
        f2 = os.path.normpath(outputFile)
        print "normalized outputFile path: ", f2
        self.assertEqual(f1, f2)

        print "Checking for presence of outputFile: ", outputFile
        self.assertTrue(
            os.path.isfile(outputFile),
            msg=
            "Missing outputFile: %r; normalized generateDataset() result: %r" %
            (outputFile, f1))

        print "Files in the destination folder after the test:"
        print os.listdir(os.path.abspath(os.path.dirname(
            findDataset(dataset))))

        print result
        print '-' * 30

        return
Example #5
  def test_AutoSpecialFields(self):
    # Cleanup old files
    #for f in glob.glob('*.*'):
    #  if 'auto_specials' in f:
    #    os.remove(f)


    fields = [('dummy', 'string', ''),
              ('timestamp', 'datetime', 'T'),
              ('reset', 'int', 'R'),
              ('sid', 'int', 'S'),
              ]

    records = (
      ['dummy-1', datetime.datetime(2000, 3, 1), 1, 1],
      ['dummy-2', datetime.datetime(2000, 3, 2), 0, 1],
      ['dummy-3', datetime.datetime(2000, 3, 3), 0, 1],
      ['dummy-4', datetime.datetime(2000, 3, 4), 1, 2],
      ['dummy-5', datetime.datetime(2000, 3, 5), 0, 2],
    )

    if not os.path.isdir('data'):
      os.makedirs('data')

    with FileRecordStream('data/auto_specials.csv', write=True, fields=fields) \
           as o:
      for r in records:
        o.appendRecord(r)

    # Aggregate just the dummy field, all the specials should be added
    ai = dict(
      fields=[('dummy', lambda x: x[0])],
      weeks=3
      )
    
    handle = \
      tempfile.NamedTemporaryFile(prefix='auto_specials', 
        suffix='.csv',
        dir='.')
    tempFile = handle.name
    handle.close()    

    outputFile = generateDataset(ai, 'auto_specials.csv', tempFile)

    result = []
    with FileRecordStream(outputFile) as f:
      print f.getFields()
      for r in f:
        result.append(r)

    self.assertEqual(result[0][2], 1) # reset
    self.assertEqual(result[0][3], 1) # seq id
    self.assertEqual(result[0][0], 'dummy-1')
    self.assertEqual(result[1][2], 1) # reset
    self.assertEqual(result[1][3], 2) # seq id
    self.assertEqual(result[1][0], 'dummy-4')

    return
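The third element of each field tuple is a flag marking the special fields the aggregator handles automatically: 'T' is the timestamp, 'R' the reset marker, and 'S' the sequence id; that is why aggregating only the dummy field still yields reset and sequence-id columns in the output. A small sketch of reading those flags out of a field list shaped like the one above; the lookup below is illustrative and not part of FileRecordStream.

# Field tuples are (name, type, specialFlag), as in the test above
fields = [('dummy', 'string', ''),
          ('timestamp', 'datetime', 'T'),   # T: timestamp field
          ('reset', 'int', 'R'),            # R: reset marker, 1 starts a new sequence
          ('sid', 'int', 'S')]              # S: sequence id

specials = dict((flag, name) for name, ftype, flag in fields if flag)
# -> {'T': 'timestamp', 'R': 'reset', 'S': 'sid'}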
Example #6
def getDatasetsImpl(baseDatasets, generate, config):
  """ Implementation for description.py getDatasets() entry point function.
  Given a dictionary of base datasets, returns a dictionary of possibly transformed
  dataset paths to use; if config['aggregationInfo'] is disabled, an identical
  dataset mapping is returned.  Optionally, generates new datasets by applying
  transformations specified in config['aggregationInfo'].

    baseDatasets: a dictionary of base dataset paths, where each key/value pair
                    corresponds to a base (raw) dataset.  The keys are as generated
                    by our getBaseDatasets(); NOTE: the paths are absolute (fixed
                    up by the framework)

    NOTE: the paths in the baseDatasets dict will have been adjusted by
      the prediction framework to point to actual dataset locations as found on
      disk, and are not likely to be the same as the (local) paths initially returned
      by getBaseDatasets

    generate:     if True and config['aggregationInfo'] is enabled, then new
                    datasets will be generated per config['aggregationInfo'];
                    otherwise, new datasets will not be generated

    config:       configuration dictionary from description.py

    Returns:      dictionary of dataset paths to use with same keys as in baseDatasets;
                    the values may differ from baseDatasets as follows: if
                    config['aggregationInfo']  is enabled, then new dataset paths
                    will be generated per config['aggregationInfo'].
  """


  # Aggregation info
  aggInfo = config['aggregationInfo'] if config['aggregationInfo'] else dict()

  datasets = dict()

  targetPaths = []
  for name in baseDatasets:
    if generate:
      # NOTE: Avoid processing the same dataset more than once, such as when the
      #  same dataset is used for training and inference in some tests
      tempPath = getFilename(aggInfo, baseDatasets[name])
      if tempPath not in targetPaths:
        path = generateDataset(aggInfo, baseDatasets[name])
        assert(path == tempPath)
      else:
        path = tempPath

      targetPaths.append(path)

    else:
      path = getFilename(aggInfo, baseDatasets[name])

    datasets[name] = path


  return datasets
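A hedged usage sketch for getDatasetsImpl, roughly as it might be wired into a description.py getDatasets() entry point; the paths and the aggregation settings below are made-up placeholders, and the shape of aggregationInfo simply follows the earlier examples on this page.

# config would normally come from the surrounding description.py
config = {
    'aggregationInfo': {
        'fields': [('consumption', sum),             # reduce each window with a callable
                   ('timestamp', lambda x: x[0])],   # keep the first timestamp
        'hours': 5,
    },
}

# Absolute paths, as fixed up by the prediction framework (placeholders here)
baseDatasets = {'train': '/abs/path/to/gym_train.csv',
                'infer': '/abs/path/to/gym_infer.csv'}

# generate=True writes each aggregated file once per distinct target path;
# generate=False only computes the target file names via getFilename()
datasets = getDatasetsImpl(baseDatasets, True, config)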
Example #8
    def test_AutoSpecialFields(self):
        # Cleanup old files
        #for f in glob.glob('*.*'):
        #  if 'auto_specials' in f:
        #    os.remove(f)

        fields = [
            ('dummy', 'string', ''),
            ('timestamp', 'datetime', 'T'),
            ('reset', 'int', 'R'),
            ('sid', 'int', 'S'),
        ]

        records = (
            ['dummy-1', datetime.datetime(2000, 3, 1), 1, 1],
            ['dummy-2', datetime.datetime(2000, 3, 2), 0, 1],
            ['dummy-3', datetime.datetime(2000, 3, 3), 0, 1],
            ['dummy-4', datetime.datetime(2000, 3, 4), 1, 2],
            ['dummy-5', datetime.datetime(2000, 3, 5), 0, 2],
        )

        if not os.path.isdir('data'):
            os.makedirs('data')

        with FileRecordStream('data/auto_specials.csv', write=True, fields=fields) \
               as o:
            for r in records:
                o.appendRecord(r)

        # Aggregate just the dummy field, all the specials should be added
        ai = dict(fields=[('dummy', lambda x: x[0])], weeks=3)

        handle = \
          tempfile.NamedTemporaryFile(prefix='auto_specials',
            suffix='.csv',
            dir='.')
        tempFile = handle.name
        handle.close()

        outputFile = generateDataset(ai, 'auto_specials.csv', tempFile)

        result = []
        with FileRecordStream(outputFile) as f:
            print f.getFields()
            for r in f:
                result.append(r)

        self.assertEqual(result[0][2], 1)  # reset
        self.assertEqual(result[0][3], 1)  # seq id
        self.assertEqual(result[0][0], 'dummy-1')
        self.assertEqual(result[1][2], 1)  # reset
        self.assertEqual(result[1][3], 2)  # seq id
        self.assertEqual(result[1][0], 'dummy-4')

        return
Example #9
    def test_AutoSpecialFields(self):
        # Cleanup old files
        # for f in glob.glob('*.*'):
        #  if 'auto_specials' in f:
        #    os.remove(f)

        fields = [("dummy", "string", ""), ("timestamp", "datetime", "T"), ("reset", "int", "R"), ("sid", "int", "S")]

        records = (
            ["dummy-1", datetime.datetime(2000, 3, 1), 1, 1],
            ["dummy-2", datetime.datetime(2000, 3, 2), 0, 1],
            ["dummy-3", datetime.datetime(2000, 3, 3), 0, 1],
            ["dummy-4", datetime.datetime(2000, 3, 4), 1, 2],
            ["dummy-5", datetime.datetime(2000, 3, 5), 0, 2],
        )

        with FileRecordStream(
            resource_filename("nupic.datafiles", "auto_specials.csv"), write=True, fields=fields
        ) as o:
            for r in records:
                o.appendRecord(r)

        # Aggregate just the dummy field, all the specials should be added
        ai = dict(fields=[("dummy", lambda x: x[0])], weeks=3)

        handle = tempfile.NamedTemporaryFile(prefix="auto_specials", suffix=".csv", dir=".")
        tempFile = handle.name
        handle.close()

        outputFile = generateDataset(ai, "auto_specials.csv", tempFile)

        result = []
        with FileRecordStream(outputFile) as f:
            print f.getFields()
            for r in f:
                result.append(r)

        self.assertEqual(result[0][2], 1)  # reset
        self.assertEqual(result[0][3], 1)  # seq id
        self.assertEqual(result[0][0], "dummy-1")
        self.assertEqual(result[1][2], 1)  # reset
        self.assertEqual(result[1][3], 2)  # seq id
        self.assertEqual(result[1][0], "dummy-4")

        return
Example #10
  def test_GapsInIrregularData(self):
    # Cleanup previous files if exist
    import glob
    for f in glob.glob('gap.*'):
      print 'Removing', f
      os.remove(f)

    #class TestParser(BaseParser):
    #  def __init__(self):
    #    def parseTimestamp(s):
    #      d,t = s.split()
    #      year, month, day = [int(x) for x in d.split('-')]
    #      hour, minute, second = [int(x) for x in t.split(':')]
    #      return datetime.datetime(year, month, day, hour, minute, second)
    #
    #    BaseParser.__init__(self,
    #                        [('dateTime', parseTimestamp),
    #                         ('sequenceId', int),
    #                         ('cardtype', int),
    #                         ('fraud', bool),
    #                         ('amount', float)],
    #                        delimiter=',')
    #  def parse(self, line):
    #    values = BaseParser.parse(self, line)
    #    return values

  #dateTime,cardnum,cardtype,fraud,amount
    data = """\
2009-04-03 19:05:06,129.3
2009-04-04 15:19:12,46.6
2009-04-07 02:54:04,30.32
2009-04-07 06:27:12,84.52
2009-04-07 06:42:21,21.1
2009-04-09 01:01:14,29.24
2009-04-09 06:47:42,99.76
2009-04-11 18:06:11,29.66
2009-04-11 18:12:53,148.32
2009-04-11 19:15:08,61.03
2009-04-15 19:25:40,53.14
2009-05-04 21:07:02,816.75
2009-05-04 21:08:27,686.07
2009-05-06 20:40:04,489.08
2009-05-06 20:40:42,586.9
2009-05-06 20:41:15,554.3
2009-05-06 20:41:51,652.11"""
    fields = [('timestamp', 'datetime', 'T'), ('amount', 'float', '')]
    with FileRecordStream(resource_filename('nupic.datafiles', 'gap.csv'), write=True, fields=fields) as f:
      lines = data.split('\n')
      for line in lines:
        t, a = line.split(',')

        components = t.split()

        yyyy, mm, dd = [int(x) for x in components[0].split('-')]
        h, m, s = [int(x) for x in components[1].split(':')]

        t = datetime.datetime(yyyy, mm, dd, h, m, s)
        a = float(a)
        f.appendRecord([t, a])

    aggregationOptions = dict(
      timeField='timestamp',
      fields=[('timestamp', lambda x: x[0]),
              ('amount', sum)],
      hours=24
      )


    handle = \
      tempfile.NamedTemporaryFile(prefix='agg_gap_hours_24', 
        suffix='.csv', 
        dir='nupic/datafiles')
    outputFile = handle.name
    handle.close()
    
    if os.path.isfile(outputFile):
      os.remove(outputFile)
    self.assertFalse(os.path.exists(outputFile),
                     msg="shouldn't exist, but does: " + str(outputFile))

    result = generateDataset(aggregationOptions, 'gap.csv', outputFile)
    self.assertEqual(
      os.path.normpath(os.path.abspath(outputFile)), os.path.normpath(result),
      msg="result = '%s'; outputFile = '%s'" % (result, outputFile))
    self.assertTrue(os.path.isfile(outputFile),
                    msg="outputFile missing or is not file: %r" % (outputFile))
    print outputFile
    print '-' * 30

    s = ''
    for r in FileRecordStream(outputFile):
      s += ', '.join([str(x) for x in r]) + '\n'

    expected = """\
2009-04-03 19:05:06, 175.9
2009-04-06 19:05:06, 135.94
2009-04-08 19:05:06, 129.0
2009-04-10 19:05:06, 177.98
2009-04-11 19:05:06, 61.03
2009-04-15 19:05:06, 53.14
2009-05-04 19:05:06, 1502.82
2009-05-06 19:05:06, 2282.39
"""

    self.assertEqual(s, expected)

    return
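The expected string can be checked by hand: every output row covers a 24-hour window anchored at the first record's timestamp (2009-04-03 19:05:06), the amounts falling inside a window are summed, and windows containing no records produce no row at all, which is why the output timestamps jump across the gaps. The standalone sketch below reproduces those eight rows; the anchoring and gap-skipping behaviour is inferred from the expected output above, not taken from the aggregator's documentation.

import datetime

# (timestamp, amount) pairs copied from the data string in the test above
records = [
    (datetime.datetime(2009, 4, 3, 19, 5, 6), 129.3),
    (datetime.datetime(2009, 4, 4, 15, 19, 12), 46.6),
    (datetime.datetime(2009, 4, 7, 2, 54, 4), 30.32),
    (datetime.datetime(2009, 4, 7, 6, 27, 12), 84.52),
    (datetime.datetime(2009, 4, 7, 6, 42, 21), 21.1),
    (datetime.datetime(2009, 4, 9, 1, 1, 14), 29.24),
    (datetime.datetime(2009, 4, 9, 6, 47, 42), 99.76),
    (datetime.datetime(2009, 4, 11, 18, 6, 11), 29.66),
    (datetime.datetime(2009, 4, 11, 18, 12, 53), 148.32),
    (datetime.datetime(2009, 4, 11, 19, 15, 8), 61.03),
    (datetime.datetime(2009, 4, 15, 19, 25, 40), 53.14),
    (datetime.datetime(2009, 5, 4, 21, 7, 2), 816.75),
    (datetime.datetime(2009, 5, 4, 21, 8, 27), 686.07),
    (datetime.datetime(2009, 5, 6, 20, 40, 4), 489.08),
    (datetime.datetime(2009, 5, 6, 20, 40, 42), 586.9),
    (datetime.datetime(2009, 5, 6, 20, 41, 15), 554.3),
    (datetime.datetime(2009, 5, 6, 20, 41, 51), 652.11),
]

anchor = records[0][0]                       # windows are anchored at the first timestamp
day = datetime.timedelta(hours=24)

buckets = {}
for t, amount in records:
    idx = (t - anchor).days                  # whole 24-hour windows since the anchor
    start = anchor + idx * day
    buckets[start] = round(buckets.get(start, 0.0) + amount, 2)

for start in sorted(buckets):                # empty windows yield no output row (the gaps)
    print('%s, %s' % (start, buckets[start]))
# Prints the same eight rows as the `expected` string in the test above.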
Example #11
    def test_GapsInIrregularData(self):
        # Cleanup previous files if exist
        import glob
        for f in glob.glob('gap.*'):
            print 'Removing', f
            os.remove(f)

        #class TestParser(BaseParser):
        #  def __init__(self):
        #    def parseTimestamp(s):
        #      d,t = s.split()
        #      year, month, day = [int(x) for x in d.split('-')]
        #      hour, minute, second = [int(x) for x in t.split(':')]
        #      return datetime.datetime(year, month, day, hour, minute, second)
        #
        #    BaseParser.__init__(self,
        #                        [('dateTime', parseTimestamp),
        #                         ('sequenceId', int),
        #                         ('cardtype', int),
        #                         ('fraud', bool),
        #                         ('amount', float)],
        #                        delimiter=',')
        #  def parse(self, line):
        #    values = BaseParser.parse(self, line)
        #    return values

    #dateTime,cardnum,cardtype,fraud,amount
        data = """\
2009-04-03 19:05:06,129.3
2009-04-04 15:19:12,46.6
2009-04-07 02:54:04,30.32
2009-04-07 06:27:12,84.52
2009-04-07 06:42:21,21.1
2009-04-09 01:01:14,29.24
2009-04-09 06:47:42,99.76
2009-04-11 18:06:11,29.66
2009-04-11 18:12:53,148.32
2009-04-11 19:15:08,61.03
2009-04-15 19:25:40,53.14
2009-05-04 21:07:02,816.75
2009-05-04 21:08:27,686.07
2009-05-06 20:40:04,489.08
2009-05-06 20:40:42,586.9
2009-05-06 20:41:15,554.3
2009-05-06 20:41:51,652.11"""
        fields = [('timestamp', 'datetime', 'T'), ('amount', 'float', '')]
        with FileRecordStream(resource_filename('nupic.datafiles', 'gap.csv'),
                              write=True,
                              fields=fields) as f:
            lines = data.split('\n')
            for line in lines:
                t, a = line.split(',')

                components = t.split()

                yyyy, mm, dd = [int(x) for x in components[0].split('-')]
                h, m, s = [int(x) for x in components[1].split(':')]

                t = datetime.datetime(yyyy, mm, dd, h, m, s)
                a = float(a)
                f.appendRecord([t, a])

        aggregationOptions = dict(timeField='timestamp',
                                  fields=[('timestamp', lambda x: x[0]),
                                          ('amount', sum)],
                                  hours=24)


        handle = \
          tempfile.NamedTemporaryFile(prefix='agg_gap_hours_24',
            suffix='.csv',
            dir='.')
        outputFile = handle.name
        handle.close()

        if os.path.isfile(outputFile):
            os.remove(outputFile)
        self.assertFalse(os.path.exists(outputFile),
                         msg="shouldn't exist, but does: " + str(outputFile))

        result = generateDataset(aggregationOptions, 'gap.csv', outputFile)
        self.assertEqual(os.path.normpath(os.path.abspath(outputFile)),
                         os.path.normpath(result),
                         msg="result = '%s'; outputFile = '%s'" %
                         (result, outputFile))
        self.assertTrue(os.path.isfile(outputFile),
                        msg="outputFile missing or is not file: %r" %
                        (outputFile))
        print outputFile
        print '-' * 30

        s = ''
        for r in FileRecordStream(outputFile):
            s += ', '.join([str(x) for x in r]) + '\n'

        expected = """\
2009-04-03 19:05:06, 175.9
2009-04-06 19:05:06, 135.94
2009-04-08 19:05:06, 129.0
2009-04-10 19:05:06, 177.98
2009-04-11 19:05:06, 61.03
2009-04-15 19:05:06, 53.14
2009-05-04 19:05:06, 1502.82
2009-05-06 19:05:06, 2282.39
"""

        self.assertEqual(s, expected)

        return