def generateStats(filename, statsInfo, maxSamples=None, filters=None, cache=True):
  """Generate the requested statistics for a dataset and cache them to a file.

  statsInfo maps each field name to the type of statistics to collect
  ("number" or "category"). If cache is False, the results are not written
  to disk.
  """
  # Sanity checking
  if not isinstance(statsInfo, dict):
    raise RuntimeError("statsInfo must be a dict -- "
                       "found '%s' instead" % type(statsInfo))

  if filters is None:
    filters = []

  filename = findDataset(filename)

  if cache:
    statsFilename = getStatsFilename(filename, statsInfo, filters)
    # Use the cached stats if found AND if they contain the requested keys
    if os.path.exists(statsFilename):
      try:
        r = pickle.load(open(statsFilename, "rb"))
      except Exception:
        # OK to ignore errors -- we will just re-generate the file
        print "Warning: unable to load stats for %s -- " \
              "will regenerate" % filename
        r = dict()
      requestedKeys = set(statsInfo)
      availableKeys = set(r.keys())
      unavailableKeys = requestedKeys.difference(availableKeys)
      if not unavailableKeys:
        return r
      else:
        print "generateStats: re-generating stats file %s because " \
              "keys %s are not available" % \
              (statsFilename, str(unavailableKeys))
        # Remove the stale stats cache (not the dataset) so it is rebuilt below
        os.remove(statsFilename)

  print "Generating statistics for file '%s' with filters '%s'" % \
        (filename, filters)
  sensor = RecordSensor()
  sensor.dataSource = FileRecordStream(filename)
  sensor.preEncodingFilters = filters

  # Convert each collector description (a field name, e.g. "consumption",
  # mapped to a stats type string) into a collector object
  for field in statsInfo:
    if statsInfo[field] == "number":
      statsInfo[field] = NumberStatsCollector()
    elif statsInfo[field] == "category":
      statsInfo[field] = CategoryStatsCollector()
    else:
      raise RuntimeError("Unknown stats type '%s' for field '%s'"
                         % (statsInfo[field], field))

  # Now collect the stats
  if maxSamples is None:
    maxSamples = 500000
  for i in xrange(maxSamples):
    try:
      record = sensor.getNextRecord()
    except StopIteration:
      break
    for (name, collector) in statsInfo.items():
      collector.add(record[name])
  del sensor

  # Assemble the results and return
  r = dict()
  for (field, collector) in statsInfo.items():
    stats = collector.getStats()
    if field not in r:
      r[field] = stats
    else:
      r[field].update(stats)

  if cache:
    f = open(statsFilename, "wb")
    pickle.dump(r, f)
    f.close()
    # The caller may need to know the name of the cached file
    r["_filename"] = statsFilename

  return r
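# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): shows how generateStats is
# typically called. The dataset path and field names below are hypothetical
# examples, not taken from this module.
def _exampleGenerateStatsUsage():
  # statsInfo maps each field name to the kind of collector to use:
  # "number" -> NumberStatsCollector, "category" -> CategoryStatsCollector
  statsInfo = {"consumption": "number", "gym": "category"}
  # Scan at most 1000 records and skip the on-disk cache
  stats = generateStats("extra/hotgym/hotgym.csv", statsInfo,
                        maxSamples=1000, cache=False)
  # The result is a dict keyed by field name; each value holds whatever
  # statistics the corresponding collector reports
  for field, fieldStats in stats.items():
    print field, fieldStats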
def testDeltaFilter(self):
  """
  Data looks like:                  should generate deltas
      "t"          "s"              "dt"     "ds"

      t            10               X
      t+1s         20               1s       10
      t+1d         50               86399    30
    r t+1d+1s      60               X
      t+1d+3s      65               2s       5

  ("r" marks a row that starts a new sequence, i.e. a reset)
  """
  r = RecordSensor()
  filename = findDataset("extra/qa/delta.csv")
  datasource = FileRecordStream(filename)
  r.dataSource = datasource
  n = 50
  encoder = MultiEncoder({'blah': dict(fieldname="s",
                                       type='ScalarEncoder',
                                       n=n, w=11, minval=0, maxval=100)})
  r.encoder = encoder

  # Test #1 -- no deltas
  # Make sure we get a reset when the sequence changes
  resetOut = numpy.zeros((1,), dtype='float')
  sequenceIdOut = numpy.zeros((1,), dtype='float')
  dataOut = numpy.zeros((n,), dtype='float')
  sourceOut = numpy.zeros((1,), dtype='float')
  categoryOut = numpy.zeros((1,), dtype='float')
  outputs = dict(resetOut=resetOut,
                 sourceOut=sourceOut,
                 sequenceIdOut=sequenceIdOut,
                 dataOut=dataOut,
                 categoryOut=categoryOut)
  inputs = dict()
  r.verbosity = 0

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24,
                                     hour=16, minute=8, second=0))
  self.assertEqual(lr['s'], 10)
  self.assertEqual(lr['_reset'], 1)
  self.assertTrue('dt' not in lr)
  self.assertTrue('ds' not in lr)

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24,
                                     hour=16, minute=8, second=1))
  self.assertEqual(lr['s'], 20)
  self.assertEqual(lr['_reset'], 0)

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25,
                                     hour=16, minute=8, second=0))
  self.assertEqual(lr['s'], 50)
  self.assertEqual(lr['_reset'], 0)

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25,
                                     hour=16, minute=8, second=1))
  self.assertEqual(lr['s'], 60)
  self.assertEqual(lr['_reset'], 1)

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25,
                                     hour=16, minute=8, second=3))
  self.assertEqual(lr['s'], 65)
  self.assertEqual(lr['_reset'], 0)

  # Test #2 -- add the delta filters
  r.preEncodingFilters = [DeltaFilter("s", "ds"), DeltaFilter("t", "dt")]
  r.rewind()

  # The first record, which has a reset, is skipped
  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24,
                                     hour=16, minute=8, second=1))
  self.assertEqual(lr['s'], 20)
  # this record should have a reset since it is the first of a sequence
  self.assertEqual(lr['_reset'], 1)
  self.assertEqual(lr['dt'], 1)
  self.assertEqual(lr['ds'], 10)

  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25,
                                     hour=16, minute=8, second=0))
  self.assertEqual(lr['s'], 50)
  self.assertEqual(lr['_reset'], 0)
  self.assertEqual(lr['dt'], 3600 * 24 - 1)
  self.assertEqual(lr['ds'], 30)

  # next reset record is skipped
  r.compute(inputs, outputs)
  lr = r.lastRecord
  self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25,
                                     hour=16, minute=8, second=3))
  self.assertEqual(lr['s'], 65)
  self.assertEqual(lr['_reset'], 1)
  self.assertEqual(lr['dt'], 2)
  self.assertEqual(lr['ds'], 5)
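# ---------------------------------------------------------------------------
# Illustration (not part of the original test): the assertions above are
# consistent with a delta filter that emits, for each record, the difference
# from the previous one -- ds = s - s_prev and dt = (t - t_prev) in seconds --
# and drops the first record of a sequence (it has nothing to diff against),
# carrying its reset flag onto the next record. A minimal sketch of just the
# delta arithmetic, assuming plain (t, s) tuples rather than RecordSensor
# records:
def _computeDeltas(records):
  deltas = []
  prev = None
  for t, s in records:
    if prev is not None:
      prevT, prevS = prev
      # elapsed seconds and value change relative to the previous record
      deltas.append((int((t - prevT).total_seconds()), s - prevS))
    prev = (t, s)
  return deltas

# For the first three rows of the test data (10 at t, 20 at t+1s, 50 at t+1d)
# this yields (1, 10) and (86399, 30), matching the dt/ds assertions above.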