def __init__(self, w, categoryList, name="category", verbosity=0):
    """Set up a category encoder backed by an integer-valued ScalarEncoder.

    Parameters:
    -----------------------------------------------------------------------
    w:            handed to the ScalarEncoder as its first argument; the
                  resulting encoder width must equal w * ncategories
    categoryList: sequence of category values; the entry at position i is
                  assigned scalar index i + 1
    name:         field name recorded in self.description and self.name
    verbosity:    stored on the instance as self.verbosity
    """
    self.encoders = None
    self.verbosity = verbosity

    # One extra slot: index 0 is reserved for the unknown category.
    self.ncategories = len(categoryList) + 1

    # Two-way mapping between category values and their scalar indices.
    self.categoryToIndex = dict()
    self.indexToCategory = {0: "<UNKNOWN>"}
    for index, category in enumerate(categoryList, 1):
        self.categoryToIndex[category] = index
        self.indexToCategory[index] = category

    self.encoder = ScalarEncoder(w, minval=0, maxval=self.ncategories - 1,
                                 radius=1, periodic=False)

    # The scalar encoder is expected to give every index its own w bits.
    self.width = w * self.ncategories
    assert self.encoder.getWidth() == self.width

    self.description = [(name, 0)]
    self.name = name

    # These are used to support the topDownCompute method
    self._topDownMappingM = None

    # This gets filled in by getBucketValues
    self._bucketValues = None
def __init__(self, w=5, resolution=1.0, minval=0.10, maxval=10000,
             name="log", verbosity=0):
    """Set up a log encoder backed by a ScalarEncoder over decibel values.

    Parameters:
    -----------------------------------------------------------------------
    w:          forwarded to the ScalarEncoder
    resolution: forwarded to the ScalarEncoder after conversion to float
    minval:     lower bound in normal space; math.log10 rejects values
                that are not greater than 0
    maxval:     upper bound in normal space
    name:       field name recorded in self.description and self.name
    verbosity:  stored on the instance as self.verbosity
    """
    self.encoders = None
    self.verbosity = verbosity

    # Bounds in scaled (10 * log10) space, as integers.
    # NOTE(review): int() truncates toward zero while the upper bound uses
    # ceil, so for a minval below 1 whose scaled value is not an integer the
    # effective minimum ends up above the requested one -- confirm intended.
    lowScaled = 10 * math.log10(minval)
    self.minScaledValue = int(lowScaled)
    highScaled = 10 * math.log10(maxval)
    self.maxScaledValue = int(math.ceil(highScaled))
    assert self.maxScaledValue > self.minScaledValue

    # Map the integer scaled bounds back to normal space; these are the
    # limits actually in force.
    self.minval = 10.0 ** (self.minScaledValue / 10.0)
    self.maxval = 10.0 ** (self.maxScaledValue / 10.0)

    # Note: passing resolution=1 causes the topDownCompute test to fail.
    # Fixed for now by always converting to float, but should find the
    # root cause.
    scalarResolution = float(resolution)
    self.encoder = ScalarEncoder(w=w,
                                 minval=self.minScaledValue,
                                 maxval=self.maxScaledValue,
                                 periodic=False,
                                 resolution=scalarResolution)
    self.width = self.encoder.getWidth()

    self.description = [(name, 0)]
    self.name = name

    # This list is created by getBucketValues() the first time it is called,
    # and re-created whenever our buckets would be re-arranged.
    self._bucketValues = None
def __init__(self, w, categoryList, name="category", verbosity=0):
    """Create the category <-> index mappings and the underlying encoder.

    Scalar index 0 stands for "<UNKNOWN>"; the categories in categoryList
    take indices 1..len(categoryList) in list order. The work of producing
    bits is delegated to a non-periodic ScalarEncoder with radius 1 that
    spans those indices.
    """
    self.encoders = None
    self.verbosity = verbosity

    # number of categories includes "unknown"
    self.ncategories = len(categoryList) + 1
    knownIndices = range(1, self.ncategories)

    # When a category value is repeated, the later position wins.
    self.categoryToIndex = dict(zip(categoryList, knownIndices))
    self.indexToCategory = {0: "<UNKNOWN>"}
    self.indexToCategory.update(zip(knownIndices, categoryList))

    highestIndex = self.ncategories - 1
    self.encoder = ScalarEncoder(w, minval=0, maxval=highestIndex,
                                 radius=1, periodic=False)

    # Check that the scalar encoder produced the width we expect.
    expectedWidth = w * self.ncategories
    self.width = expectedWidth
    assert self.encoder.getWidth() == self.width

    self.description = [(name, 0)]
    self.name = name

    # These are used to support the topDownCompute method
    self._topDownMappingM = None

    # This gets filled in by getBucketValues
    self._bucketValues = None
def __init__(self, w=5, resolution=1.0, minval=0.10, maxval=10000,
             name="log", verbosity=0):
    """Configure the encoder's decibel range and its ScalarEncoder.

    minval and maxval are given in normal space. They are converted to
    integer scaled values (10 * log10), and self.minval / self.maxval are
    then recomputed from those integers, so they may differ from the
    arguments passed in.
    """

    def toNormalSpace(scaled):
        # Inverse of the 10 * log10 scaling.
        return 10 ** (scaled / 10.0)

    self.encoders = None
    self.verbosity = verbosity

    self.minScaledValue = int(10 * math.log10(minval))
    self.maxScaledValue = int(math.ceil(10 * math.log10(maxval)))
    assert self.maxScaledValue > self.minScaledValue

    self.minval = toNormalSpace(self.minScaledValue)
    self.maxval = toNormalSpace(self.maxScaledValue)

    # Note: passing resolution=1 causes the topDownCompute test to fail.
    # Fixed for now by always converting to float, but should find the
    # root cause.
    encoderArgs = dict(w=w,
                       minval=self.minScaledValue,
                       maxval=self.maxScaledValue,
                       periodic=False,
                       resolution=float(resolution))
    self.encoder = ScalarEncoder(**encoderArgs)
    self.width = self.encoder.getWidth()

    self.description = [(name, 0)]
    self.name = name

    # This list is created by getBucketValues() the first time it is called,
    # and re-created whenever our buckets would be re-arranged.
    self._bucketValues = None
class CategoryEncoder(Encoder): """Encodes a list of discrete categories (described by strings), that aren't related to each other, so we never emit a mixture of categories. The value of zero is reserved for "unknown category" Internally we use a ScalarEncoder with a radius of 1, but since we only encode integers, we never get mixture outputs. The SDRCategoryEncoder uses a different method to encode categories""" ############################################################################ def __init__(self, w, categoryList, name="category", verbosity=0): self.encoders = None self.verbosity = verbosity # number of categories includes "unknown" self.ncategories = len(categoryList) + 1 self.categoryToIndex = dict() self.indexToCategory = dict() self.indexToCategory[0] = "<UNKNOWN>" for i in xrange(len(categoryList)): self.categoryToIndex[categoryList[i]] = i+1 self.indexToCategory[i+1] = categoryList[i] self.encoder = ScalarEncoder(w, minval=0, maxval=self.ncategories - 1, radius=1, periodic=False) self.width = w * self.ncategories assert self.encoder.getWidth() == self.width self.description = [(name, 0)] self.name = name # These are used to support the topDownCompute method self._topDownMappingM = None # This gets filled in by getBucketValues self._bucketValues = None ############################################################################ def getDecoderOutputFieldTypes(self): """ [Encoder class virtual method override] """ # TODO: change back to string meta-type after the decoding logic is fixed # to output strings instead of internal index values. 
#return (FieldMetaType.string,) return (FieldMetaType.integer,) ############################################################################ def getWidth(self): return self.width ############################################################################ def getDescription(self): return self.description ############################################################################ def getScalars(self, input): """ See method description in base.py """ if input == SENTINEL_VALUE_FOR_MISSING_DATA: return numpy.array([None]) else: return numpy.array([self.categoryToIndex.get(input, 0)]) ############################################################################ def getBucketIndices(self, input): """ See method description in base.py """ # Get the bucket index from the underlying scalar encoder if input == SENTINEL_VALUE_FOR_MISSING_DATA: return [None] else: return self.encoder.getBucketIndices(self.categoryToIndex.get(input, 0)) ############################################################################ def encodeIntoArray(self, input, output): # if not found, we encode category 0 if input == SENTINEL_VALUE_FOR_MISSING_DATA: output[0:] = 0 val = "<missing>" else: val = self.categoryToIndex.get(input, 0) self.encoder.encodeIntoArray(val, output) if self.verbosity >= 2: print "input:", input, "va:", val, "output:", output print "decoded:", self.decodedToStr(self.decode(output)) ############################################################################ def decode(self, encoded, parentFieldName=''): """ See the function description in base.py """ # Get the scalar values from the underlying scalar encoder (fieldsDict, fieldNames) = self.encoder.decode(encoded) if len(fieldsDict) == 0: return (fieldsDict, fieldNames) # Expect only 1 field assert(len(fieldsDict) == 1) # Get the list of categories the scalar values correspond to and # generate the description from the category name(s). 
(inRanges, inDesc) = fieldsDict.values()[0] outRanges = [] desc = "" for (minV, maxV) in inRanges: minV = int(round(minV)) maxV = int(round(maxV)) outRanges.append((minV, maxV)) while minV <= maxV: if len(desc) > 0: desc += ", " desc += self.indexToCategory[minV] minV += 1 # Return result if parentFieldName != '': fieldName = "%s.%s" % (parentFieldName, self.name) else: fieldName = self.name return ({fieldName: (outRanges, desc)}, [fieldName]) ############################################################################ def closenessScores(self, expValues, actValues, fractional=True,): """ See the function description in base.py kwargs will have the keyword "fractional", which is ignored by this encoder """ expValue = expValues[0] actValue = actValues[0] if expValue == actValue: closeness = 1.0 else: closeness = 0.0 if not fractional: closeness = 1.0 - closeness #print "category::", "expValue:", expValue, "actValue:", actValue, \ # "closeness", closeness #import pdb; pdb.set_trace() return numpy.array([closeness]) ############################################################################ def getBucketValues(self): """ See the function description in base.py """ if self._bucketValues is None: numBuckets = len(self.encoder.getBucketValues()) self._bucketValues = [] for bucketIndex in range(numBuckets): self._bucketValues.append(self.getBucketInfo([bucketIndex])[0].value) return self._bucketValues ############################################################################ def getBucketInfo(self, buckets): """ See the function description in base.py """ # For the category encoder, the bucket index is the category index bucketInfo = self.encoder.getBucketInfo(buckets)[0] categoryIndex = int(round(bucketInfo.value)) category = self.indexToCategory[categoryIndex] return [EncoderResult(value=category, scalar=categoryIndex, encoding=bucketInfo.encoding)] ############################################################################ def topDownCompute(self, encoded): """ See 
the function description in base.py """ encoderResult = self.encoder.topDownCompute(encoded)[0] value = encoderResult.value categoryIndex = int(round(value)) category = self.indexToCategory[categoryIndex] return EncoderResult(value=category, scalar=categoryIndex, encoding=encoderResult.encoding)
def __init__(self, season=0, dayOfWeek=0, weekend=0, holiday=0,
             timeOfDay=0, customDays=0, name=''):
    """Create one ScalarEncoder per requested date attribute.

    An attribute left at 0 is not encoded. season, dayOfWeek, weekend and
    timeOfDay take either a width or an indexable (width, radius);
    holiday takes a width; customDays takes (width, day) or
    (width, [day, ...]) where each day is a weekday name such as "mon" or
    "monday". Sub-encoders are laid out in the order season, day of week,
    weekend, customdays, holiday, time of day.
    """

    def widthAndRadius(spec, defaultRadius):
        # An indexable spec carries its own radius; a bare number does not.
        if hasattr(spec, "__getitem__"):
            return spec[0], spec[1]
        return spec, defaultRadius

    def attach(label, encoder):
        # Place the encoder after everything attached so far and record it
        # in the description and encoder lists; returns its offset.
        offset = self.width
        self.width += encoder.getWidth()
        self.description.append((label, offset))
        self.encoders.append((label, encoder, offset))
        return offset

    # Accepted weekday spellings (monday = 0).
    dayNumbers = {
        "mon": 0, "monday": 0,
        "tue": 1, "tuesday": 1,
        "wed": 2, "wednesday": 2,
        "thu": 3, "thursday": 3,
        "fri": 4, "friday": 4,
        "sat": 5, "saturday": 5,
        "sun": 6, "sunday": 6,
    }

    self.width = 0
    self.description = []
    self.name = name

    # This will contain a list of (name, encoder, offset) tuples for use by
    # the decode() method
    self.encoders = []

    self.seasonEncoder = None
    if season != 0:
        # Ignore leapyear differences -- assume 366 days in a year
        # Radius = 91.5 days = length of season
        # Value is number of days since beginning of year (0 - 365)
        w, radius = widthAndRadius(season, 91.5)
        self.seasonEncoder = ScalarEncoder(w=w, minval=0, maxval=366,
                                           radius=radius, periodic=True,
                                           name="season")
        self.seasonOffset = attach("season", self.seasonEncoder)

    self.dayOfWeekEncoder = None
    if dayOfWeek != 0:
        # Value is day of week (floating point)
        # Radius is 1 day
        w, radius = widthAndRadius(dayOfWeek, 1)
        self.dayOfWeekEncoder = ScalarEncoder(w=w, minval=0, maxval=7,
                                              radius=radius, periodic=True,
                                              name="day of week")
        self.dayOfWeekOffset = attach("day of week", self.dayOfWeekEncoder)

    self.weekendEncoder = None
    if weekend != 0:
        # Binary value. Not sure if this makes sense. Also is somewhat
        # redundant with dayOfWeek
        # The radius is 1 when it was not provided
        w, radius = widthAndRadius(weekend, 1)
        self.weekendEncoder = ScalarEncoder(w=w, minval=0, maxval=1,
                                            periodic=False, radius=radius,
                                            name="weekend")
        self.weekendOffset = attach("weekend", self.weekendEncoder)

    # Set up custom days encoder, first argument in tuple is width
    # second is either a single day of the week or a list of the days
    # you want encoded as ones.
    self.customDaysEncoder = None
    if customDays != 0:
        customDayEncoderName = ""
        daysToParse = []
        assert len(customDays) == 2, "Please provide a w and the desired days"
        requestedDays = customDays[1]
        if isinstance(requestedDays, list):
            customDayEncoderName += "".join(str(day) + " "
                                            for day in requestedDays)
            daysToParse = requestedDays
        elif isinstance(requestedDays, str):
            customDayEncoderName += requestedDays
            daysToParse = [requestedDays]
        else:
            assert False, "You must provide either a list of days or a single day"

        # Parse days
        self.customDays = []
        for day in daysToParse:
            dayNumber = dayNumbers.get(day.lower())
            if dayNumber is None:
                assert False, "Unable to understand %s as a day of week" % str(
                    day)
            else:
                self.customDays.append(dayNumber)

        self.customDaysEncoder = ScalarEncoder(w=customDays[0], minval=0,
                                               maxval=1, periodic=False,
                                               radius=1,
                                               name=customDayEncoderName)
        self.customDaysOffset = attach("customdays", self.customDaysEncoder)

    self.holidayEncoder = None
    if holiday != 0:
        # A "continuous" binary value. = 1 on the holiday itself and smooth
        # ramp 0->1 on the day before the holiday and 1->0 on the day after
        # the holiday.
        self.holidayEncoder = ScalarEncoder(w=holiday, minval=0, maxval=1,
                                            periodic=False, radius=1,
                                            name="holiday")
        self.holidayOffset = attach("holiday", self.holidayEncoder)

    self.timeOfDayEncoder = None
    if timeOfDay != 0:
        # Value is time of day in hours
        # Radius = 4 hours, e.g. morning, afternoon, evening, early night,
        # late night, etc.
        w, radius = widthAndRadius(timeOfDay, 4)
        self.timeOfDayEncoder = ScalarEncoder(w=w, minval=0, maxval=24,
                                              periodic=True, radius=radius,
                                              name="time of day")
        self.timeOfDayOffset = attach("time of day", self.timeOfDayEncoder)
class DateEncoder(Encoder):
    """A date encoder encodes a date according to encoding parameters
    specified in its constructor.

    The input to a date encoder is a datetime.datetime object. The output
    is the concatenation of several sub-encodings, each of which encodes a
    different aspect of the date. Which sub-encodings are present, and
    details of those sub-encodings, are specified in the DateEncoder
    constructor.

    Each parameter describes one attribute to encode. By default, the
    attribute is not encoded.

    season (season of the year; units = day):
      (int) width of attribute; default radius = 91.5 days (1 season)
      (tuple) season[0] = width; season[1] = radius

    dayOfWeek (monday = 0; units = day)
      (int) width of attribute; default radius = 1 day
      (tuple) dayOfWeek[0] = width; dayOfWeek[1] = radius

    weekend (boolean: 0, 1)
      (int) width of attribute; default radius = 1
      (tuple) weekend[0] = width; weekend[1] = radius

    customDays (boolean: 0, 1; 1 when the weekday is one of the given days)
      (tuple) customDays[0] = width; customDays[1] = a day name or a list
              of day names ("mon"/"monday" ... "sun"/"sunday", any case)

    holiday (boolean: 0, 1)
      (int) width of attribute

    timeOfDay (midnight = 0; units = hour)
      (int) width of attribute: default radius = 4 hours
      (tuple) timeOfDay[0] = width; timeOfDay[1] = radius

    Sub-encodings are concatenated in the order: season, day of week,
    weekend, customdays, holiday, time of day.
    """

    # Accepted spellings of each weekday, using the same numbering as
    # time.struct_time.tm_wday (monday = 0).
    _DAY_NAME_TO_INDEX = {
        "mon": 0, "monday": 0,
        "tue": 1, "tuesday": 1,
        "wed": 2, "wednesday": 2,
        "thu": 3, "thursday": 3,
        "fri": 4, "friday": 4,
        "sat": 5, "saturday": 5,
        "sun": 6, "sunday": 6,
    }

    def __init__(self, season=0, dayOfWeek=0, weekend=0, holiday=0,
                 timeOfDay=0, customDays=0, name=''):
        """Create one ScalarEncoder per requested attribute.

        See the class docstring for the meaning of each parameter. Every
        configured sub-encoder is appended to self.encoders and
        self.description together with its bit offset.
        """
        self.width = 0
        self.description = []
        self.name = name

        # This will contain a list of (name, encoder, offset) tuples for use
        # by the decode() method
        self.encoders = []

        self.seasonEncoder = None
        if season != 0:
            # Ignore leapyear differences -- assume 366 days in a year
            # Radius = 91.5 days = length of season
            # Value is number of days since beginning of year (0 - 365)
            w, radius = self._splitWidthAndRadius(season, 91.5)
            self.seasonEncoder = ScalarEncoder(w=w, minval=0, maxval=366,
                                               radius=radius, periodic=True,
                                               name="season")
            self.seasonOffset = self._addSubEncoder("season",
                                                    self.seasonEncoder)

        self.dayOfWeekEncoder = None
        if dayOfWeek != 0:
            # Value is day of week (floating point)
            # Radius is 1 day
            w, radius = self._splitWidthAndRadius(dayOfWeek, 1)
            self.dayOfWeekEncoder = ScalarEncoder(w=w, minval=0, maxval=7,
                                                  radius=radius,
                                                  periodic=True,
                                                  name="day of week")
            self.dayOfWeekOffset = self._addSubEncoder(
                "day of week", self.dayOfWeekEncoder)

        self.weekendEncoder = None
        if weekend != 0:
            # Binary value. Not sure if this makes sense. Also is somewhat
            # redundant with dayOfWeek
            w, radius = self._splitWidthAndRadius(weekend, 1)
            self.weekendEncoder = ScalarEncoder(w=w, minval=0, maxval=1,
                                                periodic=False,
                                                radius=radius,
                                                name="weekend")
            self.weekendOffset = self._addSubEncoder("weekend",
                                                     self.weekendEncoder)

        # Set up custom days encoder, first argument in tuple is width
        # second is either a single day of the week or a list of the days
        # you want encoded as ones.
        self.customDaysEncoder = None
        if customDays != 0:
            customDayEncoderName = ""
            daysToParse = []
            assert len(customDays) == 2, \
                "Please provide a w and the desired days"
            if isinstance(customDays[1], list):
                for day in customDays[1]:
                    customDayEncoderName += str(day) + " "
                daysToParse = customDays[1]
            elif isinstance(customDays[1], str):
                customDayEncoderName += customDays[1]
                daysToParse = [customDays[1]]
            else:
                assert False, \
                    "You must provide either a list of days or a single day"

            # Parse days into tm_wday numbers
            self.customDays = []
            for day in daysToParse:
                dayIndex = self._DAY_NAME_TO_INDEX.get(day.lower())
                if dayIndex is None:
                    assert False, \
                        "Unable to understand %s as a day of week" % str(day)
                else:
                    self.customDays.append(dayIndex)

            self.customDaysEncoder = ScalarEncoder(
                w=customDays[0], minval=0, maxval=1, periodic=False,
                radius=1, name=customDayEncoderName)
            self.customDaysOffset = self._addSubEncoder(
                "customdays", self.customDaysEncoder)

        self.holidayEncoder = None
        if holiday != 0:
            # A "continuous" binary value. = 1 on the holiday itself and
            # smooth ramp 0->1 on the day before the holiday and 1->0 on
            # the day after the holiday.
            self.holidayEncoder = ScalarEncoder(w=holiday, minval=0,
                                                maxval=1, periodic=False,
                                                radius=1, name="holiday")
            self.holidayOffset = self._addSubEncoder("holiday",
                                                     self.holidayEncoder)

        self.timeOfDayEncoder = None
        if timeOfDay != 0:
            # Value is time of day in hours
            # Radius = 4 hours, e.g. morning, afternoon, evening, early
            # night, late night, etc.
            w, radius = self._splitWidthAndRadius(timeOfDay, 4)
            self.timeOfDayEncoder = ScalarEncoder(w=w, minval=0, maxval=24,
                                                  periodic=True,
                                                  radius=radius,
                                                  name="time of day")
            self.timeOfDayOffset = self._addSubEncoder(
                "time of day", self.timeOfDayEncoder)

    @staticmethod
    def _splitWidthAndRadius(spec, defaultRadius):
        """Return (width, radius) from a width or an indexable (width, radius)."""
        if hasattr(spec, "__getitem__"):
            return spec[0], spec[1]
        return spec, defaultRadius

    def _addSubEncoder(self, label, encoder):
        """Register encoder after the bits used so far; return its offset."""
        offset = self.width
        self.width += encoder.getWidth()
        self.description.append((label, offset))
        self.encoders.append((label, encoder, offset))
        return offset

    def getWidth(self):
        """Return the combined width of all configured sub-encoders."""
        return self.width

    def getScalarNames(self, parentFieldName=''):
        """ See method description in base.py

        Returns one name per configured sub-encoder, each being the
        sub-encoder's own name prefixed with "parentFieldName." when
        parentFieldName is non-empty.

        The names follow the order of self.encoders, which is also the
        order in which getEncodedValues() appends the scalars. The previous
        hand-written sequence listed customDays before weekend, so names
        and scalars were misaligned when both were configured.
        """
        if parentFieldName == '':
            return [encoder.name
                    for (_label, encoder, _offset) in self.encoders]

        return ['%s.%s' % (parentFieldName, encoder.name)
                for (_label, encoder, _offset) in self.encoders]

    def getEncodedValues(self, input):
        """ See method description in base.py

        Returns a list with one scalar per configured sub-encoder, in the
        order of self.encoders, or numpy.array([None]) for missing data.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            return numpy.array([None])

        assert isinstance(input, datetime.datetime)
        values = []

        # Get the scalar values for each sub-field
        timetuple = input.timetuple()
        timeOfDay = timetuple.tm_hour + float(timetuple.tm_min) / 60.0

        if self.seasonEncoder is not None:
            # input.timetuple() computes the day of year 1 based, so
            # convert to 0 based
            values.append(timetuple.tm_yday - 1)

        if self.dayOfWeekEncoder is not None:
            values.append(timetuple.tm_wday)

        if self.weekendEncoder is not None:
            # saturday, sunday or friday evening
            if timetuple.tm_wday == 6 or timetuple.tm_wday == 5 \
                    or (timetuple.tm_wday == 4 and timeOfDay > 18):
                weekend = 1
            else:
                weekend = 0
            values.append(weekend)

        if self.customDaysEncoder is not None:
            if timetuple.tm_wday in self.customDays:
                customDay = 1
            else:
                customDay = 0
            values.append(customDay)

        if self.holidayEncoder is not None:
            # A "continuous" binary value. = 1 on the holiday itself and
            # smooth ramp 0->1 on the day before the holiday and 1->0 on
            # the day after the holiday.
            # Currently the only holiday we know about is December 25
            # holidays is a list of holidays that occur on a fixed date
            # every year
            holidays = [(12, 25)]
            val = 0
            for h in holidays:
                # hdate is midnight on the holiday
                hdate = datetime.datetime(timetuple.tm_year, h[0], h[1],
                                          0, 0, 0)
                if input > hdate:
                    diff = input - hdate
                    if diff.days == 0:
                        # return 1 on the holiday itself
                        val = 1
                        break
                    elif diff.days == 1:
                        # ramp smoothly from 1 -> 0 on the next day
                        val = 1.0 - (float(diff.seconds) / (86400))
                        break
                else:
                    diff = hdate - input
                    if diff.days == 0:
                        # ramp smoothly from 0 -> 1 on the previous day
                        val = 1.0 - (float(diff.seconds) / 86400)

            values.append(val)

        if self.timeOfDayEncoder is not None:
            values.append(timeOfDay)

        return values

    def getScalars(self, input):
        """ See method description in base.py

        Parameters:
        -----------------------------------------------------------------------
        input:          A datetime object representing the time being encoded

        Returns:        A numpy array of the corresponding scalar values in
                        the following order:

                        [season, dayOfWeek, weekend, customDays, holiday,
                         timeOfDay]

                        Note: some of these fields might be omitted if they
                        were not specified in the encoder
        """
        return numpy.array(self.getEncodedValues(input))

    def getBucketIndices(self, input):
        """ See method description in base.py """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            # One placeholder per sub-field
            return [None] * len(self.encoders)

        assert isinstance(input, datetime.datetime)

        # Get the scalar values for each sub-field
        scalars = self.getScalars(input)

        # Encode each sub-field
        result = []
        for i, (_label, encoder, _offset) in enumerate(self.encoders):
            result.extend(encoder.getBucketIndices(scalars[i]))
        return result

    def encodeIntoArray(self, input, output):
        """ See method description in base.py

        Raises ValueError when input is neither the missing-data sentinel
        nor a datetime.datetime.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            output[0:] = 0
            return

        if not isinstance(input, datetime.datetime):
            raise ValueError(
                "Input is type %s, expected datetime. Value: %s"
                % (type(input), str(input)))

        # Get the scalar values for each sub-field
        scalars = self.getScalars(input)

        # Encode each sub-field into the slice of output starting at its
        # offset
        for i, (_label, encoder, offset) in enumerate(self.encoders):
            encoder.encodeIntoArray(scalars[i], output[offset:])

    def getDescription(self):
        """Return the list of (name, offset) pairs for the sub-encoders."""
        return self.description
class LogEncoder(Encoder): """A Log encoder represents a floating point value on a logarithmic (decibel) scale. valueToEncode = 10 * log10(input) The default resolution (minimum difference in scaled values which is guaranteed to propduce different outputs) is 1 decibel. For example, the scaled values 10 and 11 will be distinguishable in the output. In terms of the original input values, this means 10^1 (10) and 10^1.1 (12.5) will be distinguishable. resolution -- encoder resolution, in terms of scaled values. Default: 1 decibel minval -- must be greater than 0. Lower values are reset to this value maxval -- Higher values are reset to this value """ def __init__(self, w=5, resolution=1.0, minval=0.10, maxval=10000, name="log", verbosity=0): self.encoders = None self.verbosity = verbosity self.minScaledValue = int(10 * math.log10(minval)) self.maxScaledValue = int(math.ceil(10 * math.log10(maxval))) assert self.maxScaledValue > self.minScaledValue self.minval = 10 ** (self.minScaledValue / 10.0) self.maxval = 10 ** (self.maxScaledValue / 10.0) # Note: passing resolution=1 causes the test to topDownCompute # test to fail. Fixed for now by always converting to float, # but should find the root cause. self.encoder = ScalarEncoder( w=w, minval=self.minScaledValue, maxval=self.maxScaledValue, periodic=False, resolution=float(resolution) ) self.width = self.encoder.getWidth() self.description = [(name, 0)] self.name = name # This list is created by getBucketValues() the first time it is called, # and re-created whenever our buckets would be re-arranged. 
self._bucketValues = None ############################################################################ def getWidth(self): return self.width ############################################################################ def getDescription(self): return self.description ############################################################################ def _getScaledValue(self, input): """ Convert the input, which is in normal space, into log space """ if input == SENTINEL_VALUE_FOR_MISSING_DATA: return None else: val = input if val < self.minval: val = self.minval elif val > self.maxval: val = self.maxval scaledVal = 10 * math.log10(val) return scaledVal ############################################################################ def getBucketIndices(self, input): """ See the function description in base.py """ # Get the scaled value scaledVal = self._getScaledValue(input) if scaledVal is None: return [None] else: return self.encoder.getBucketIndices(scaledVal) ############################################################################ def encodeIntoArray(self, input, output): """ See the function description in base.py """ # Get the scaled value scaledVal = self._getScaledValue(input) if scaledVal is None: output[0:] = 0 else: self.encoder.encodeIntoArray(scaledVal, output) if self.verbosity >= 2: print "input:", input, "scaledVal:", scaledVal, "output:", output print "decoded:", self.decodedToStr(self.decode(output)) ############################################################################ def decode(self, encoded, parentFieldName=""): """ See the function description in base.py """ # Get the scalar values from the underlying scalar encoder (fieldsDict, fieldNames) = self.encoder.decode(encoded) if len(fieldsDict) == 0: return (fieldsDict, fieldNames) # Expect only 1 field assert len(fieldsDict) == 1 # Convert each range into normal space (inRanges, inDesc) = fieldsDict.values()[0] outRanges = [] for (minV, maxV) in inRanges: outRanges.append((math.pow(10, minV / 
10.0), math.pow(10, maxV / 10.0))) # Generate a text description of the ranges desc = "" numRanges = len(outRanges) for i in xrange(numRanges): if outRanges[i][0] != outRanges[i][1]: desc += "%.2f-%.2f" % (outRanges[i][0], outRanges[i][1]) else: desc += "%.2f" % (outRanges[i][0]) if i < numRanges - 1: desc += ", " # Return result if parentFieldName != "": fieldName = "%s.%s" % (parentFieldName, self.name) else: fieldName = self.name return ({fieldName: (outRanges, desc)}, [fieldName]) ############################################################################ def getBucketValues(self): """ See the function description in base.py """ # Need to re-create? if self._bucketValues is None: scaledValues = self.encoder.getBucketValues() self._bucketValues = [] for scaledValue in scaledValues: value = math.pow(10, scaledValue / 10.0) self._bucketValues.append(value) return self._bucketValues ############################################################################ def getBucketInfo(self, buckets): """ See the function description in base.py """ scaledResult = self.encoder.getBucketInfo(buckets)[0] scaledValue = scaledResult.value value = math.pow(10, scaledValue / 10.0) return [EncoderResult(value=value, scalar=value, encoding=scaledResult.encoding)] ############################################################################ def topDownCompute(self, encoded): """ See the function description in base.py """ scaledResult = self.encoder.topDownCompute(encoded)[0] scaledValue = scaledResult.value value = math.pow(10, scaledValue / 10.0) return EncoderResult(value=value, scalar=value, encoding=scaledResult.encoding) ############################################################################ def closenessScores(self, expValues, actValues, fractional=True): """ See the function description in base.py """ # Compute the percent error in log space if expValues[0] > 0: expValue = 10 * math.log10(expValues[0]) else: expValue = self.minScaledValue if actValues[0] > 0: actValue = 
10 * math.log10(actValues[0]) else: actValue = self.minScaledValue if fractional: err = abs(expValue - actValue) pctErr = err / (self.maxScaledValue - self.minScaledValue) pctErr = min(1.0, pctErr) closeness = 1.0 - pctErr else: err = abs(expValue - actValue) closeness = err # print "log::", "expValue:", expValues[0], "actValue:", actValues[0], \ # "closeness", closeness # import pdb; pdb.set_trace() return numpy.array([closeness])
def __init__(self, season=0, dayOfWeek=0, weekend=0, holiday=0, timeOfDay=0,
             customDays=0, name=''):
    """Build the sub-encoders selected by the non-zero arguments.

    Every argument left at 0 disables that sub-field and leaves the matching
    ``self.<field>Encoder`` attribute set to None (no ``<field>Offset``
    attribute is created for it).

    season, dayOfWeek, weekend, timeOfDay: either a bare width, or an
      indexable ``(width, radius)`` pair. Default radii are 91.5, 1, 1 and 4.
    holiday: width of the holiday sub-field.
    customDays: ``(width, days)`` where ``days`` is one day name or a list of
      day names ("mon"/"monday" ... "sun"/"sunday", case-insensitive).
    name: name stored on this encoder.

    Sub-fields are laid out in this order: season, day of week, weekend,
    customdays, holiday, time of day.
    """

    def _widthAndRadius(spec, defaultRadius):
        # Anything indexable is read as (width, radius); otherwise the value
        # itself is the width and the default radius applies.
        if hasattr(spec, "__getitem__"):
            return spec[0], spec[1]
        return spec, defaultRadius

    def _attach(label, subEncoder):
        # Place subEncoder at the current end of the output and record it in
        # both bookkeeping lists. Returns the offset it was placed at.
        offset = self.width
        self.width += subEncoder.getWidth()
        self.description.append((label, offset))
        self.encoders.append((label, subEncoder, offset))
        return offset

    self.width = 0
    self.description = []
    self.name = name

    # List of (name, encoder, offset) tuples for use by the decode() method
    self.encoders = []

    self.seasonEncoder = None
    if season != 0:
        # Leap-year differences are ignored: the year is treated as 366 days.
        # The encoded value is the zero-based day of the year (0 - 365) and
        # the default radius of 91.5 days is the length of one season.
        w, radius = _widthAndRadius(season, 91.5)
        self.seasonEncoder = ScalarEncoder(w=w, minval=0, maxval=366,
                                           radius=radius, periodic=True,
                                           name="season")
        self.seasonOffset = _attach("season", self.seasonEncoder)

    self.dayOfWeekEncoder = None
    if dayOfWeek != 0:
        # The encoded value is the day of the week; default radius is 1 day.
        w, radius = _widthAndRadius(dayOfWeek, 1)
        self.dayOfWeekEncoder = ScalarEncoder(w=w, minval=0, maxval=7,
                                              radius=radius, periodic=True,
                                              name="day of week")
        self.dayOfWeekOffset = _attach("day of week", self.dayOfWeekEncoder)

    self.weekendEncoder = None
    if weekend != 0:
        # Binary value; somewhat redundant with dayOfWeek. A radius of 1 is
        # used when the caller supplied only a width.
        w, radius = _widthAndRadius(weekend, 1)
        self.weekendEncoder = ScalarEncoder(w=w, minval=0, maxval=1,
                                            periodic=False, radius=radius,
                                            name="weekend")
        self.weekendOffset = _attach("weekend", self.weekendEncoder)

    # Custom days: customDays[0] is the width, customDays[1] is a single day
    # of the week or a list of the days to be encoded as ones.
    self.customDaysEncoder = None
    if customDays != 0:
        assert len(customDays)==2, "Please provide a w and the desired days"
        daySpec = customDays[1]
        if isinstance(daySpec, list):
            # The encoder name is every listed day followed by a space.
            customDayEncoderName = "".join(str(day) + " " for day in daySpec)
            daysToParse = daySpec
        elif isinstance(daySpec, str):
            customDayEncoderName = daySpec
            daysToParse = [daySpec]
        else:
            assert False, "You must provide either a list of days or a single day"

        # Translate the day names into weekday numbers (monday = 0).
        dayNumbers = {
            "mon": 0, "monday": 0,
            "tue": 1, "tuesday": 1,
            "wed": 2, "wednesday": 2,
            "thu": 3, "thursday": 3,
            "fri": 4, "friday": 4,
            "sat": 5, "saturday": 5,
            "sun": 6, "sunday": 6,
        }
        self.customDays = []
        for day in daysToParse:
            lowered = day.lower()
            if lowered in dayNumbers:
                self.customDays += [dayNumbers[lowered]]
            else:
                assert False, "Unable to understand %s as a day of week" % str(day)

        self.customDaysEncoder = ScalarEncoder(w=customDays[0], minval=0,
                                               maxval=1, periodic=False,
                                               radius=1,
                                               name=customDayEncoderName)
        self.customDaysOffset = _attach("customdays", self.customDaysEncoder)

    self.holidayEncoder = None
    if holiday != 0:
        # A "continuous" binary value: 1 on the holiday itself, with a smooth
        # ramp 0->1 on the day before and 1->0 on the day after.
        self.holidayEncoder = ScalarEncoder(w=holiday, minval=0, maxval=1,
                                            periodic=False, radius=1,
                                            name="holiday")
        self.holidayOffset = _attach("holiday", self.holidayEncoder)

    self.timeOfDayEncoder = None
    if timeOfDay != 0:
        # The encoded value is the time of day in hours. The default radius
        # of 4 hours gives e.g. morning, afternoon, evening, early night,
        # late night, etc.
        w, radius = _widthAndRadius(timeOfDay, 4)
        self.timeOfDayEncoder = ScalarEncoder(w=w, minval=0, maxval=24,
                                              periodic=True, radius=radius,
                                              name="time of day")
        self.timeOfDayOffset = _attach("time of day", self.timeOfDayEncoder)
class DateEncoder(Encoder):
    """A date encoder encodes a date according to encoding parameters
    specified in its constructor.

    The input to a date encoder is a datetime.datetime object. The output is
    the concatenation of several sub-encodings, each of which encodes a
    different aspect of the date. Which sub-encodings are present, and details
    of those sub-encodings, are specified in the DateEncoder constructor.

    Each parameter describes one attribute to encode. By default, the
    attribute is not encoded.

    season (season of the year; units = day):
      (int) width of attribute; default radius = 91.5 days (1 season)
      (tuple)  season[0] = width; season[1] = radius

    dayOfWeek (monday = 0; units = day)
      (int) width of attribute; default radius = 1 day
      (tuple) dayOfWeek[0] = width; dayOfWeek[1] = radius

    weekend (boolean: 0, 1)
      (int) width of attribute; default radius = 1
      (tuple) weekend[0] = width; weekend[1] = radius

    customDays (boolean: 0, 1; 1 when the weekday is one of the given days)
      (tuple) customDays[0] = width; customDays[1] = a day name or a list of
              day names ("mon"/"monday" ... "sun"/"sunday")

    holiday (boolean: 0, 1)
      (int) width of attribute

    timeOfDay (midnight = 0; units = hour)
      (int) width of attribute: default radius = 4 hours
      (tuple) timeOfDay[0] = width; timeOfDay[1] = radius

    The sub-fields are always laid out, and reported by getScalarNames() and
    getScalars(), in this order: season, dayOfWeek, weekend, customDays,
    holiday, timeOfDay.
    """

    def __init__(self, season=0, dayOfWeek=0, weekend=0, holiday=0,
                 timeOfDay=0, customDays=0, name=''):
        """Build the sub-encoders selected by the non-zero arguments.

        See the class docstring for the meaning of each argument. A sub-field
        left at 0 keeps its ``self.<field>Encoder`` attribute at None and gets
        no ``<field>Offset`` attribute.
        """

        def _widthAndRadius(spec, defaultRadius):
            # Anything indexable is read as (width, radius); otherwise the
            # value itself is the width and the default radius applies.
            if hasattr(spec, "__getitem__"):
                return spec[0], spec[1]
            return spec, defaultRadius

        def _attach(label, subEncoder):
            # Place subEncoder at the current end of the output and record it
            # in both bookkeeping lists. Returns the offset it was placed at.
            offset = self.width
            self.width += subEncoder.getWidth()
            self.description.append((label, offset))
            self.encoders.append((label, subEncoder, offset))
            return offset

        self.width = 0
        self.description = []
        self.name = name

        # List of (name, encoder, offset) tuples for use by the decode()
        # method. Its order is also the order of the values produced by
        # getEncodedValues().
        self.encoders = []

        self.seasonEncoder = None
        if season != 0:
            # Leap-year differences are ignored: the year is treated as 366
            # days. The encoded value is the zero-based day of the year
            # (0 - 365); the default radius of 91.5 days is one season.
            w, radius = _widthAndRadius(season, 91.5)
            self.seasonEncoder = ScalarEncoder(w=w, minval=0, maxval=366,
                                               radius=radius, periodic=True,
                                               name="season")
            self.seasonOffset = _attach("season", self.seasonEncoder)

        self.dayOfWeekEncoder = None
        if dayOfWeek != 0:
            # The encoded value is the day of week; default radius is 1 day.
            w, radius = _widthAndRadius(dayOfWeek, 1)
            self.dayOfWeekEncoder = ScalarEncoder(w=w, minval=0, maxval=7,
                                                  radius=radius,
                                                  periodic=True,
                                                  name="day of week")
            self.dayOfWeekOffset = _attach("day of week",
                                           self.dayOfWeekEncoder)

        self.weekendEncoder = None
        if weekend != 0:
            # Binary value; somewhat redundant with dayOfWeek. A radius of 1
            # is used when the caller supplied only a width.
            w, radius = _widthAndRadius(weekend, 1)
            self.weekendEncoder = ScalarEncoder(w=w, minval=0, maxval=1,
                                                periodic=False, radius=radius,
                                                name="weekend")
            self.weekendOffset = _attach("weekend", self.weekendEncoder)

        # Custom days: customDays[0] is the width, customDays[1] is a single
        # day of the week or a list of the days to be encoded as ones.
        self.customDaysEncoder = None
        if customDays != 0:
            # NOTE: validation stays as assert so callers keep seeing
            # AssertionError for bad arguments.
            assert len(customDays)==2, "Please provide a w and the desired days"
            daySpec = customDays[1]
            if isinstance(daySpec, list):
                # The encoder name is every listed day followed by a space.
                customDayEncoderName = "".join(str(day) + " "
                                               for day in daySpec)
                daysToParse = daySpec
            elif isinstance(daySpec, str):
                customDayEncoderName = daySpec
                daysToParse = [daySpec]
            else:
                assert False, "You must provide either a list of days or a single day"

            # Translate the day names into weekday numbers (monday = 0).
            dayNumbers = {
                "mon": 0, "monday": 0,
                "tue": 1, "tuesday": 1,
                "wed": 2, "wednesday": 2,
                "thu": 3, "thursday": 3,
                "fri": 4, "friday": 4,
                "sat": 5, "saturday": 5,
                "sun": 6, "sunday": 6,
            }
            self.customDays = []
            for day in daysToParse:
                lowered = day.lower()
                if lowered in dayNumbers:
                    self.customDays += [dayNumbers[lowered]]
                else:
                    assert False, "Unable to understand %s as a day of week" % str(day)

            self.customDaysEncoder = ScalarEncoder(w=customDays[0], minval=0,
                                                   maxval=1, periodic=False,
                                                   radius=1,
                                                   name=customDayEncoderName)
            self.customDaysOffset = _attach("customdays",
                                            self.customDaysEncoder)

        self.holidayEncoder = None
        if holiday != 0:
            # A "continuous" binary value: 1 on the holiday itself, with a
            # smooth ramp 0->1 on the day before and 1->0 on the day after.
            self.holidayEncoder = ScalarEncoder(w=holiday, minval=0, maxval=1,
                                                periodic=False, radius=1,
                                                name="holiday")
            self.holidayOffset = _attach("holiday", self.holidayEncoder)

        self.timeOfDayEncoder = None
        if timeOfDay != 0:
            # The encoded value is the time of day in hours. The default
            # radius of 4 hours gives e.g. morning, afternoon, evening,
            # early night, late night, etc.
            w, radius = _widthAndRadius(timeOfDay, 4)
            self.timeOfDayEncoder = ScalarEncoder(w=w, minval=0, maxval=24,
                                                  periodic=True,
                                                  radius=radius,
                                                  name="time of day")
            self.timeOfDayOffset = _attach("time of day",
                                           self.timeOfDayEncoder)

    def getWidth(self):
        """Return the summed width of all configured sub-encoders."""
        return self.width

    def getScalarNames(self, parentFieldName=''):
        """ See method description in base.py

        One name is returned per configured sub-encoder: the sub-encoder's
        own name, prefixed with "<parentFieldName>." when parentFieldName is
        not empty.
        """
        # The names are taken from self.encoders so that they come out in the
        # same order as the values built by getEncodedValues(). A hand-written
        # sequence here previously listed customDays before weekend, which
        # mislabelled both fields whenever both were enabled.
        names = []
        for entry in self.encoders:
            subEncoder = entry[1]
            if parentFieldName == '':
                names.append(subEncoder.name)
            else:
                names.append('%s.%s' % (parentFieldName, subEncoder.name))
        return names

    def _getHolidayValue(self, input):
        """Return the holiday scalar for the datetime ``input``.

        The value is 1 during the holiday, ramps 0->1 over the 24 hours
        before it and 1->0 over the 24 hours after it, and is 0 otherwise.
        """
        # Currently the only holiday we know about is December 25.
        # holidays is a list of (month, day) that fall on a fixed date.
        holidays = [(12, 25)]
        year = input.timetuple().tm_year
        val = 0
        for h in holidays:
            # hdate is midnight at the start of the holiday
            hdate = datetime.datetime(year, h[0], h[1], 0, 0, 0)
            if input > hdate:
                diff = input - hdate
                if diff.days == 0:
                    # on the holiday itself
                    val = 1
                    break
                elif diff.days == 1:
                    # ramp smoothly from 1 -> 0 on the next day
                    val = 1.0 - (float(diff.seconds) / 86400)
                    break
            else:
                diff = hdate - input
                if diff.days == 0:
                    # ramp smoothly from 0 -> 1 on the previous day
                    val = 1.0 - (float(diff.seconds) / 86400)
        return val

    def getEncodedValues(self, input):
        """ See method description in base.py

        Returns a list with one scalar per configured sub-encoder, in the
        order of self.encoders. For the missing-data sentinel a one-element
        numpy array holding None is returned instead.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            return numpy.array([None])

        assert isinstance(input, datetime.datetime)
        values = []

        timetuple = input.timetuple()
        # Hours since midnight, with minutes as a fraction (seconds ignored)
        timeOfDay = timetuple.tm_hour + float(timetuple.tm_min)/60.0

        if self.seasonEncoder is not None:
            # tm_yday is 1 based, so convert to 0 based
            values.append(timetuple.tm_yday - 1)

        if self.dayOfWeekEncoder is not None:
            values.append(timetuple.tm_wday)

        if self.weekendEncoder is not None:
            # saturday, sunday or friday evening
            isWeekend = (timetuple.tm_wday == 6 or timetuple.tm_wday == 5
                         or (timetuple.tm_wday == 4 and timeOfDay > 18))
            values.append(1 if isWeekend else 0)

        if self.customDaysEncoder is not None:
            values.append(1 if timetuple.tm_wday in self.customDays else 0)

        if self.holidayEncoder is not None:
            values.append(self._getHolidayValue(input))

        if self.timeOfDayEncoder is not None:
            values.append(timeOfDay)

        return values

    def getScalars(self, input):
        """ See method description in base.py

        Parameters:
        -----------------------------------------------------------------------
        input:          A datetime object representing the time being encoded

        Returns:        A numpy array of the corresponding scalar values in
                        the following order:

                        [season, dayOfWeek, weekend, customDays, holiday,
                         timeOfDay]

                        Note: some of these fields might be omitted if they
                        were not specified in the encoder
        """
        return numpy.array(self.getEncodedValues(input))

    def getBucketIndices(self, input):
        """ See method description in base.py """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            # One placeholder per sub-field
            return [None] * len(self.encoders)

        assert isinstance(input, datetime.datetime)

        # Get the scalar values for each sub-field
        scalars = self.getScalars(input)

        # Collect the bucket indices of each sub-field in turn
        result = []
        for i, (name, encoder, offset) in enumerate(self.encoders):
            result.extend(encoder.getBucketIndices(scalars[i]))
        return result

    def encodeIntoArray(self, input, output):
        """ See method description in base.py

        Raises ValueError when input is neither the missing-data sentinel nor
        a datetime.datetime.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            output[0:] = 0
            return

        if not isinstance(input, datetime.datetime):
            raise ValueError("Input is type %s, expected datetime. Value: %s" % (
                type(input), str(input)))

        # Get the scalar values for each sub-field
        scalars = self.getScalars(input)

        # Each sub-encoder writes into the output starting at its own offset
        for i, (name, encoder, offset) in enumerate(self.encoders):
            encoder.encodeIntoArray(scalars[i], output[offset:])

    def getDescription(self):
        """Return the list of (sub-field name, offset) pairs."""
        return self.description
class CategoryEncoder(Encoder):
    """Encodes a list of discrete categories (described by strings), that
    aren't related to each other, so we never emit a mixture of categories.

    The value of zero is reserved for "unknown category"

    Internally we use a ScalarEncoder with a radius of 1, but since we only
    encode integers, we never get mixture outputs.

    The SDRCategoryEncoder uses a different method to encode categories"""

    def __init__(self, w, categoryList, name="category", verbosity=0):
        """
        w:            width passed to the underlying ScalarEncoder; the total
                      output width is w * (len(categoryList) + 1)
        categoryList: the known categories; category i is given index i + 1
        name:         field name used in the description and by decode()
        verbosity:    encodeIntoArray() prints debug output when >= 2
        """
        self.encoders = None
        self.verbosity = verbosity

        # number of categories includes "unknown"
        self.ncategories = len(categoryList) + 1

        # Index 0 is reserved for anything that is not in categoryList
        self.categoryToIndex = dict()
        self.indexToCategory = dict()
        self.indexToCategory[0] = "<UNKNOWN>"
        for index, category in enumerate(categoryList, 1):
            self.categoryToIndex[category] = index
            self.indexToCategory[index] = category

        self.encoder = ScalarEncoder(w, minval=0, maxval=self.ncategories - 1,
                                     radius=1, periodic=False)
        self.width = w * self.ncategories
        assert self.encoder.getWidth() == self.width

        self.description = [(name, 0)]
        self.name = name

        # These are used to support the topDownCompute method
        self._topDownMappingM = None

        # This gets filled in by getBucketValues
        self._bucketValues = None

    def getDecoderOutputFieldTypes(self):
        """ [Encoder class virtual method override]
        """
        # TODO: change back to string meta-type after the decoding logic is
        # fixed to output strings instead of internal index values.
        #return (FieldMetaType.string,)
        return (FieldMetaType.integer, )

    def getWidth(self):
        """Return the total output width."""
        return self.width

    def getDescription(self):
        """Return the [(name, 0)] description list."""
        return self.description

    def getScalars(self, input):
        """ See method description in base.py

        The scalar is the category index; unknown categories map to 0.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            return numpy.array([None])
        return numpy.array([self.categoryToIndex.get(input, 0)])

    def getBucketIndices(self, input):
        """ See method description in base.py """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            return [None]
        # Get the bucket index from the underlying scalar encoder
        return self.encoder.getBucketIndices(
            self.categoryToIndex.get(input, 0))

    def encodeIntoArray(self, input, output):
        """ See method description in base.py

        Missing data zeroes the output; a category that is not known is
        encoded as category 0.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            output[0:] = 0
            val = "<missing>"
        else:
            # if not found, we encode category 0
            val = self.categoryToIndex.get(input, 0)
            self.encoder.encodeIntoArray(val, output)

        if self.verbosity >= 2:
            # Single-argument print(...) emits the same text on Python 2 and
            # Python 3 as the old space-separated print statement did.
            print("input: %s va: %s output: %s" % (input, val, output))
            print("decoded: %s" % (self.decodedToStr(self.decode(output)),))

    def decode(self, encoded, parentFieldName=''):
        """ See the function description in base.py

        Returns ({fieldName: (ranges, description)}, [fieldName]) where the
        ranges are pairs of category indices and the description lists the
        corresponding category names separated by ", ".
        """
        # Get the scalar values from the underlying scalar encoder
        (fieldsDict, fieldNames) = self.encoder.decode(encoded)
        if len(fieldsDict) == 0:
            return (fieldsDict, fieldNames)

        # Expect only 1 field
        assert (len(fieldsDict) == 1)

        # list(...) because dict.values() is not indexable on Python 3
        (inRanges, inDesc) = list(fieldsDict.values())[0]

        # Get the list of categories the scalar values correspond to and
        # generate the description from the category name(s).
        outRanges = []
        desc = ""
        for (minV, maxV) in inRanges:
            first = int(round(minV))
            last = int(round(maxV))
            outRanges.append((first, last))
            for categoryIndex in range(first, last + 1):
                if len(desc) > 0:
                    desc += ", "
                desc += self.indexToCategory[categoryIndex]

        # Return result
        if parentFieldName != '':
            fieldName = "%s.%s" % (parentFieldName, self.name)
        else:
            fieldName = self.name
        return ({fieldName: (outRanges, desc)}, [fieldName])

    def closenessScores(self, expValues, actValues, fractional=True,):
        """ See the function description in base.py

        Categories either match or they do not: with fractional=True the
        score is 1.0 for a match and 0.0 otherwise; with fractional=False the
        two values are swapped.
        """
        if expValues[0] == actValues[0]:
            closeness = 1.0
        else:
            closeness = 0.0

        if not fractional:
            closeness = 1.0 - closeness

        return numpy.array([closeness])

    def getBucketValues(self):
        """ See the function description in base.py """
        if self._bucketValues is None:
            numBuckets = len(self.encoder.getBucketValues())
            # Assigned in one step so a failure part-way through cannot
            # leave a truncated list cached.
            self._bucketValues = [self.getBucketInfo([bucketIndex])[0].value
                                  for bucketIndex in range(numBuckets)]
        return self._bucketValues

    def getBucketInfo(self, buckets):
        """ See the function description in base.py
        """
        # For the category encoder, the bucket index is the category index
        bucketInfo = self.encoder.getBucketInfo(buckets)[0]

        categoryIndex = int(round(bucketInfo.value))
        category = self.indexToCategory[categoryIndex]

        return [EncoderResult(value=category, scalar=categoryIndex,
                              encoding=bucketInfo.encoding)]

    def topDownCompute(self, encoded):
        """ See the function description in base.py
        """
        encoderResult = self.encoder.topDownCompute(encoded)[0]

        categoryIndex = int(round(encoderResult.value))
        category = self.indexToCategory[categoryIndex]

        return EncoderResult(value=category, scalar=categoryIndex,
                             encoding=encoderResult.encoding)
class LogEncoder(Encoder):
    """A Log encoder represents a floating point value on a logarithmic
    (decibel) scale.

    valueToEncode = 10 * log10(input)

    The default resolution (minimum difference in scaled values which is
    guaranteed to produce different outputs) is 1 decibel. For example, the
    scaled values 10 and 11 will be distinguishable in the output. In terms of
    the original input values, this means 10^1 (10) and 10^1.1 (about 12.6)
    will be distinguishable.

    resolution -- encoder resolution, in terms of scaled values. Default: 1
                  decibel
    minval -- must be greater than 0. Lower values are reset to this value
    maxval -- must be greater than 0. Higher values are reset to this value

    The bounds are widened outward to whole decibels, so the effective
    self.minval is <= the requested minval and the effective self.maxval is
    >= the requested maxval.
    """

    def __init__(self, w=5, resolution=1.0, minval=0.10, maxval=10000,
                 name="log", verbosity=0):
        """
        Raises ValueError if minval or maxval is not greater than 0 (the
        logarithm is undefined there).
        """
        if minval <= 0 or maxval <= 0:
            # math.log10 would raise ValueError("math domain error") anyway;
            # raise the same exception type with a usable message.
            raise ValueError(
                "minval and maxval must be greater than 0 "
                "(got minval=%r, maxval=%r)" % (minval, maxval))

        self.encoders = None
        self.verbosity = verbosity

        # Round the bounds outward to whole decibels. floor() is required for
        # the lower bound: int() truncates toward zero, which would round a
        # negative decibel value *up* and silently raise the lower limit.
        self.minScaledValue = int(math.floor(10 * math.log10(minval)))
        self.maxScaledValue = int(math.ceil(10 * math.log10(maxval)))
        assert self.maxScaledValue > self.minScaledValue

        # Effective bounds in normal space after the rounding above
        self.minval = 10**(self.minScaledValue / 10.0)
        self.maxval = 10**(self.maxScaledValue / 10.0)

        # Note: passing resolution=1 causes the topDownCompute test to fail.
        #  Fixed for now by always converting to float, but should find the
        #  root cause.
        self.encoder = ScalarEncoder(w=w,
                                     minval=self.minScaledValue,
                                     maxval=self.maxScaledValue,
                                     periodic=False,
                                     resolution=float(resolution))
        self.width = self.encoder.getWidth()
        self.description = [(name, 0)]
        self.name = name

        # This list is created by getBucketValues() the first time it is
        #  called, and re-created whenever our buckets would be re-arranged.
        self._bucketValues = None

    def getWidth(self):
        """Return the width of the underlying scalar encoder."""
        return self.width

    def getDescription(self):
        """Return the [(name, 0)] description list."""
        return self.description

    def _getScaledValue(self, input):
        """
        Convert the input, which is in normal space, into log space.

        Returns None for the missing-data sentinel; otherwise the input is
        clipped to [self.minval, self.maxval] and 10 * log10 of it returned.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            return None

        val = input
        if val < self.minval:
            val = self.minval
        elif val > self.maxval:
            val = self.maxval

        return 10 * math.log10(val)

    def getBucketIndices(self, input):
        """
        See the function description in base.py
        """
        scaledVal = self._getScaledValue(input)
        if scaledVal is None:
            return [None]
        return self.encoder.getBucketIndices(scaledVal)

    def encodeIntoArray(self, input, output):
        """
        See the function description in base.py
        """
        scaledVal = self._getScaledValue(input)

        if scaledVal is None:
            output[0:] = 0
        else:
            self.encoder.encodeIntoArray(scaledVal, output)

            if self.verbosity >= 2:
                # Single-argument print(...) emits the same text on Python 2
                # and Python 3 as the old print statement did.
                print("input: %s scaledVal: %s output: %s"
                      % (input, scaledVal, output))
                print("decoded: %s"
                      % (self.decodedToStr(self.decode(output)),))

    def decode(self, encoded, parentFieldName=''):
        """
        See the function description in base.py

        Returns ({fieldName: (ranges, description)}, [fieldName]) with the
        ranges converted from log space back into normal space.
        """
        # Get the scalar values from the underlying scalar encoder
        (fieldsDict, fieldNames) = self.encoder.decode(encoded)
        if len(fieldsDict) == 0:
            return (fieldsDict, fieldNames)

        # Expect only 1 field
        assert (len(fieldsDict) == 1)

        # list(...) because dict.values() is not indexable on Python 3
        (inRanges, inDesc) = list(fieldsDict.values())[0]

        # Convert each range into normal space
        outRanges = [(math.pow(10, minV / 10.0), math.pow(10, maxV / 10.0))
                     for (minV, maxV) in inRanges]

        # Generate a text description of the ranges
        desc = ""
        numRanges = len(outRanges)
        for i, (low, high) in enumerate(outRanges):
            if low != high:
                desc += "%.2f-%.2f" % (low, high)
            else:
                desc += "%.2f" % (low)
            if i < numRanges - 1:
                desc += ", "

        # Return result
        if parentFieldName != '':
            fieldName = "%s.%s" % (parentFieldName, self.name)
        else:
            fieldName = self.name
        return ({fieldName: (outRanges, desc)}, [fieldName])

    def getBucketValues(self):
        """
        See the function description in base.py
        """
        # Need to re-create?
        if self._bucketValues is None:
            scaledValues = self.encoder.getBucketValues()
            # Each bucket value of the scalar encoder is in decibels;
            # convert it back into normal space.
            self._bucketValues = [math.pow(10, scaledValue / 10.0)
                                  for scaledValue in scaledValues]
        return self._bucketValues

    def getBucketInfo(self, buckets):
        """
        See the function description in base.py
        """
        scaledResult = self.encoder.getBucketInfo(buckets)[0]
        value = math.pow(10, scaledResult.value / 10.0)

        return [EncoderResult(value=value, scalar=value,
                              encoding=scaledResult.encoding)]

    def topDownCompute(self, encoded):
        """
        See the function description in base.py
        """
        scaledResult = self.encoder.topDownCompute(encoded)[0]
        value = math.pow(10, scaledResult.value / 10.0)

        return EncoderResult(value=value, scalar=value,
                             encoding=scaledResult.encoding)

    def closenessScores(self, expValues, actValues, fractional=True):
        """
        See the function description in base.py

        The comparison is done in log space. With fractional=True the result
        is 1.0 minus the error as a fraction of the encoder's scaled range
        (capped so the score is never negative); with fractional=False the
        result is the absolute error in decibels.
        """
        # Non-positive values have no logarithm; treat them as the minimum.
        if expValues[0] > 0:
            expValue = 10 * math.log10(expValues[0])
        else:
            expValue = self.minScaledValue

        if actValues[0] > 0:
            actValue = 10 * math.log10(actValues[0])
        else:
            actValue = self.minScaledValue

        err = abs(expValue - actValue)
        if fractional:
            pctErr = err / (self.maxScaledValue - self.minScaledValue)
            pctErr = min(1.0, pctErr)
            closeness = 1.0 - pctErr
        else:
            closeness = err

        return numpy.array([closeness])