Esempio n. 1
0
    def __init__(self, w, categoryList, name="category", verbosity=0):
        """Build the category/index lookup tables and the backing ScalarEncoder.

        w            -- passed to ScalarEncoder as its first argument; the
                        total width is w * (number of categories + 1)
        categoryList -- the known categories; the category at position p is
                        given index p + 1, index 0 is kept for "<UNKNOWN>"
        name         -- stored as self.name and used in self.description
        verbosity    -- stored as self.verbosity
        """
        self.name = name
        self.verbosity = verbosity
        self.encoders = None
        self.description = [(name, 0)]

        # The extra category is the "unknown" slot at index 0.
        categoryCount = len(categoryList) + 1
        self.ncategories = categoryCount

        forward = {}
        backward = {0: "<UNKNOWN>"}
        for index, category in enumerate(categoryList, 1):
            forward[category] = index
            backward[index] = category
        self.categoryToIndex = forward
        self.indexToCategory = backward

        self.encoder = ScalarEncoder(w,
                                     minval=0,
                                     maxval=categoryCount - 1,
                                     radius=1,
                                     periodic=False)
        self.width = w * categoryCount
        # Sanity check: the scalar encoder must produce exactly this width.
        assert self.encoder.getWidth() == self.width

        # Used to support the topDownCompute method
        self._topDownMappingM = None

        # Filled in by getBucketValues
        self._bucketValues = None
Esempio n. 2
0
    def __init__(self,
                 w=5,
                 resolution=1.0,
                 minval=0.10,
                 maxval=10000,
                 name="log",
                 verbosity=0):
        """Set up a log-scale encoder on top of a ScalarEncoder.

        minval and maxval are converted to scaled units (10 * log10(value))
        and snapped to integers: the lower bound with int(), the upper bound
        with math.ceil(). self.minval and self.maxval are then recomputed
        from the snapped bounds, so they may differ from the arguments.

        w, resolution -- forwarded to the ScalarEncoder (resolution as float)
        name          -- stored as self.name and used in self.description
        verbosity     -- stored as self.verbosity
        """
        self.encoders = None
        self.verbosity = verbosity
        self.name = name
        self.description = [(name, 0)]

        # Created by getBucketValues() the first time it is called, and
        # re-created whenever our buckets would be re-arranged.
        self._bucketValues = None

        scaledLow = int(10 * math.log10(minval))
        scaledHigh = int(math.ceil(10 * math.log10(maxval)))
        self.minScaledValue = scaledLow
        self.maxScaledValue = scaledHigh
        assert scaledHigh > scaledLow

        self.minval = 10**(scaledLow / 10.0)
        self.maxval = 10**(scaledHigh / 10.0)

        # Note: passing resolution=1 (an int) causes the topDownCompute test
        # to fail. Worked around for now by always converting to float; the
        # root cause is still unknown.
        self.encoder = ScalarEncoder(w=w,
                                     minval=scaledLow,
                                     maxval=scaledHigh,
                                     periodic=False,
                                     resolution=float(resolution))
        self.width = self.encoder.getWidth()
Esempio n. 3
0
  def __init__(self, w, categoryList, name="category", verbosity=0):
    """Create the lookup tables and the internal ScalarEncoder.

    Categories are numbered from 1 in the order they appear in categoryList;
    index 0 is reserved for "<UNKNOWN>". The encoder width is
    w * (len(categoryList) + 1).
    """
    self.encoders = None
    self.verbosity = verbosity

    # One more than the list length: index 0 stands for "unknown".
    self.ncategories = 1 + len(categoryList)

    numbered = [(position + 1, category)
                for position, category in enumerate(categoryList)]
    self.categoryToIndex = dict((category, index)
                                for index, category in numbered)
    self.indexToCategory = {0: "<UNKNOWN>"}
    self.indexToCategory.update(numbered)

    highestIndex = self.ncategories - 1
    self.encoder = ScalarEncoder(w, minval=0, maxval=highestIndex, radius=1,
                                 periodic=False)
    self.width = w * self.ncategories
    assert self.encoder.getWidth() == self.width

    self.name = name
    self.description = [(name, 0)]

    # Used to support the topDownCompute method
    self._topDownMappingM = None

    # Filled in by getBucketValues
    self._bucketValues = None
Esempio n. 4
0
  def __init__(self, w=5, resolution=1.0, minval=0.10, maxval=10000,
               name="log", verbosity=0):
    """Configure the encoder and its underlying ScalarEncoder.

    The value range is expressed in scaled units, 10 * log10(value). The
    scaled lower bound is truncated with int() and the scaled upper bound is
    rounded up with math.ceil(); self.minval / self.maxval are derived back
    from those integer bounds.
    """
    self.encoders = None
    self.verbosity = verbosity

    lowerScaled = 10 * math.log10(minval)
    upperScaled = 10 * math.log10(maxval)
    self.minScaledValue = int(lowerScaled)
    self.maxScaledValue = int(math.ceil(upperScaled))
    assert self.maxScaledValue > self.minScaledValue

    self.minval = 10 ** (self.minScaledValue / 10.0)
    self.maxval = 10 ** (self.maxScaledValue / 10.0)

    # Note: passing resolution=1 (an int) causes the topDownCompute test to
    # fail. Always converting to float works around it; the root cause still
    # needs to be found.
    scalarArgs = dict(w=w,
                      minval=self.minScaledValue,
                      maxval=self.maxScaledValue,
                      periodic=False,
                      resolution=float(resolution))
    self.encoder = ScalarEncoder(**scalarArgs)
    self.width = self.encoder.getWidth()

    self.name = name
    self.description = [(name, 0)]

    # Created by getBucketValues() the first time it is called, and
    # re-created whenever our buckets would be re-arranged.
    self._bucketValues = None
Esempio n. 5
0
class CategoryEncoder(Encoder):
  """Encodes a list of discrete categories (described by strings), that aren't
  related to each other, so we never emit a mixture of categories.

  The value of zero is reserved for "unknown category"

  Internally we use a ScalarEncoder with a radius of 1, but since we only encode
  integers, we never get mixture outputs.

  The SDRCategoryEncoder uses a different method to encode categories"""

  ############################################################################
  def __init__(self, w, categoryList, name="category", verbosity=0):

    self.encoders = None
    self.verbosity = verbosity

    # number of categories includes "unknown"
    self.ncategories = len(categoryList) + 1

    self.categoryToIndex = dict()
    self.indexToCategory = dict()
    self.indexToCategory[0] = "<UNKNOWN>"
    for i in xrange(len(categoryList)):
      self.categoryToIndex[categoryList[i]] = i+1
      self.indexToCategory[i+1] = categoryList[i]

    self.encoder = ScalarEncoder(w, minval=0, maxval=self.ncategories - 1,
                      radius=1, periodic=False)
    self.width = w * self.ncategories
    assert self.encoder.getWidth() == self.width

    self.description = [(name, 0)]
    self.name = name

    # These are used to support the topDownCompute method
    self._topDownMappingM = None

    # This gets filled in by getBucketValues
    self._bucketValues = None


  ############################################################################
  def getDecoderOutputFieldTypes(self):
    """ [Encoder class virtual method override]
    """
    # TODO: change back to string meta-type after the decoding logic is fixed
    #       to output strings instead of internal index values.
    #return (FieldMetaType.string,)
    return (FieldMetaType.integer,)


  ############################################################################
  def getWidth(self):
    return self.width

  ############################################################################
  def getDescription(self):
    return self.description

  ############################################################################
  def getScalars(self, input):
    """ See method description in base.py """
    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
      return numpy.array([None])
    else:
      return numpy.array([self.categoryToIndex.get(input, 0)])


  ############################################################################
  def getBucketIndices(self, input):
    """ See method description in base.py """

    # Get the bucket index from the underlying scalar encoder
    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
      return [None]
    else:
      return self.encoder.getBucketIndices(self.categoryToIndex.get(input, 0))



  ############################################################################
  def encodeIntoArray(self, input, output):
    # if not found, we encode category 0
    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
      output[0:] = 0
      val = "<missing>"
    else:
      val = self.categoryToIndex.get(input, 0)
      self.encoder.encodeIntoArray(val, output)

    if self.verbosity >= 2:
      print "input:", input, "va:", val, "output:", output
      print "decoded:", self.decodedToStr(self.decode(output))


  ############################################################################
  def decode(self, encoded, parentFieldName=''):
    """ See the function description in base.py
    """

    # Get the scalar values from the underlying scalar encoder
    (fieldsDict, fieldNames) = self.encoder.decode(encoded)
    if len(fieldsDict) == 0:
      return (fieldsDict, fieldNames)

    # Expect only 1 field
    assert(len(fieldsDict) == 1)

    # Get the list of categories the scalar values correspond to and
    #  generate the description from the category name(s).
    (inRanges, inDesc) = fieldsDict.values()[0]
    outRanges = []
    desc = ""
    for (minV, maxV) in inRanges:
      minV = int(round(minV))
      maxV = int(round(maxV))
      outRanges.append((minV, maxV))
      while minV <= maxV:
        if len(desc) > 0:
          desc += ", "
        desc += self.indexToCategory[minV]
        minV += 1

    # Return result
    if parentFieldName != '':
      fieldName = "%s.%s" % (parentFieldName, self.name)
    else:
      fieldName = self.name
    return ({fieldName: (outRanges, desc)}, [fieldName])


  ############################################################################
  def closenessScores(self, expValues, actValues, fractional=True,):
    """ See the function description in base.py

    kwargs will have the keyword "fractional", which is ignored by this encoder
    """

    expValue = expValues[0]
    actValue = actValues[0]

    if expValue == actValue:
      closeness = 1.0
    else:
      closeness = 0.0

    if not fractional:
      closeness = 1.0 - closeness

    #print "category::", "expValue:", expValue, "actValue:", actValue, \
    #      "closeness", closeness
    #import pdb; pdb.set_trace()

    return numpy.array([closeness])



  ############################################################################
  def getBucketValues(self):
    """ See the function description in base.py """

    if self._bucketValues is None:
      numBuckets = len(self.encoder.getBucketValues())
      self._bucketValues = []
      for bucketIndex in range(numBuckets):
        self._bucketValues.append(self.getBucketInfo([bucketIndex])[0].value)

    return self._bucketValues

  ############################################################################
  def getBucketInfo(self, buckets):
    """ See the function description in base.py
    """

    # For the category encoder, the bucket index is the category index
    bucketInfo = self.encoder.getBucketInfo(buckets)[0]

    categoryIndex = int(round(bucketInfo.value))
    category = self.indexToCategory[categoryIndex]

    return [EncoderResult(value=category, scalar=categoryIndex,
                         encoding=bucketInfo.encoding)]



  ############################################################################
  def topDownCompute(self, encoded):
    """ See the function description in base.py
    """

    encoderResult = self.encoder.topDownCompute(encoded)[0]
    value = encoderResult.value
    categoryIndex = int(round(value))
    category = self.indexToCategory[categoryIndex]

    return EncoderResult(value=category, scalar=categoryIndex,
                         encoding=encoderResult.encoding)
Esempio n. 6
0
    def __init__(self,
                 season=0,
                 dayOfWeek=0,
                 weekend=0,
                 holiday=0,
                 timeOfDay=0,
                 customDays=0,
                 name=''):
        """Create one ScalarEncoder per requested date attribute.

        An attribute whose argument equals 0 is not encoded and its
        ``<attribute>Encoder`` member stays None. season, dayOfWeek, weekend
        and timeOfDay accept either a width or an indexable (width, radius)
        pair; holiday accepts a width; customDays must be a pair of
        (width, day name or list of day names).

        Sub-encoders are laid out in this order: season, day of week,
        weekend, custom days, holiday, time of day.
        """
        self.width = 0
        self.description = []
        self.name = name

        # (name, encoder, offset) tuples for use by the decode() method
        self.encoders = []

        def splitSpec(spec, defaultRadius):
            """Return (w, radius) from a bare width or an indexable pair."""
            if hasattr(spec, "__getitem__"):
                return spec[0], spec[1]
            return spec, defaultRadius

        def attach(label, encoder):
            """Add encoder to the layout and return its bit offset."""
            offset = self.width
            self.width += encoder.getWidth()
            self.description.append((label, offset))
            self.encoders.append((label, encoder, offset))
            return offset

        # Season: periodic over 0..366 so leap years fit; the default radius
        # of 91.5 days is the length of one season.
        self.seasonEncoder = None
        if season != 0:
            w, radius = splitSpec(season, 91.5)
            self.seasonEncoder = ScalarEncoder(w=w,
                                               minval=0,
                                               maxval=366,
                                               radius=radius,
                                               periodic=True,
                                               name="season")
            self.seasonOffset = attach("season", self.seasonEncoder)

        # Day of week: periodic over 0..7, default radius of 1 day.
        self.dayOfWeekEncoder = None
        if dayOfWeek != 0:
            w, radius = splitSpec(dayOfWeek, 1)
            self.dayOfWeekEncoder = ScalarEncoder(w=w,
                                                  minval=0,
                                                  maxval=7,
                                                  radius=radius,
                                                  periodic=True,
                                                  name="day of week")
            self.dayOfWeekOffset = attach("day of week",
                                          self.dayOfWeekEncoder)

        # Weekend: binary value, somewhat redundant with dayOfWeek. The
        # radius defaults to 1 when only a width is given.
        self.weekendEncoder = None
        if weekend != 0:
            w, radius = splitSpec(weekend, 1)
            self.weekendEncoder = ScalarEncoder(w=w,
                                                minval=0,
                                                maxval=1,
                                                periodic=False,
                                                radius=radius,
                                                name="weekend")
            self.weekendOffset = attach("weekend", self.weekendEncoder)

        # Custom days: customDays[0] is the width, customDays[1] is a single
        # day of the week or a list of the days to be encoded as ones.
        self.customDaysEncoder = None
        if customDays != 0:
            customDayEncoderName = ""
            daysToParse = []
            assert len(
                customDays) == 2, "Please provide a w and the desired days"
            requestedDays = customDays[1]
            if isinstance(requestedDays, list):
                customDayEncoderName += "".join(
                    str(day) + " " for day in requestedDays)
                daysToParse = requestedDays
            elif isinstance(requestedDays, str):
                customDayEncoderName += requestedDays
                daysToParse = [requestedDays]
            else:
                assert False, "You must provide either a list of days or a single day"

            # Translate day names into weekday numbers (monday = 0).
            weekdayNumbers = {
                "mon": 0, "monday": 0,
                "tue": 1, "tuesday": 1,
                "wed": 2, "wednesday": 2,
                "thu": 3, "thursday": 3,
                "fri": 4, "friday": 4,
                "sat": 5, "saturday": 5,
                "sun": 6, "sunday": 6,
            }
            self.customDays = []
            for day in daysToParse:
                dayKey = day.lower()
                if dayKey in weekdayNumbers:
                    self.customDays.append(weekdayNumbers[dayKey])
                else:
                    assert False, "Unable to understand %s as a day of week" % str(
                        day)
            self.customDaysEncoder = ScalarEncoder(w=customDays[0],
                                                   minval=0,
                                                   maxval=1,
                                                   periodic=False,
                                                   radius=1,
                                                   name=customDayEncoderName)
            self.customDaysOffset = attach("customdays",
                                           self.customDaysEncoder)

        # Holiday: a "continuous" binary value, 1 on the holiday itself with
        # a smooth ramp 0->1 the day before and 1->0 the day after.
        self.holidayEncoder = None
        if holiday != 0:
            self.holidayEncoder = ScalarEncoder(w=holiday,
                                                minval=0,
                                                maxval=1,
                                                periodic=False,
                                                radius=1,
                                                name="holiday")
            self.holidayOffset = attach("holiday", self.holidayEncoder)

        # Time of day: hours, periodic over 0..24. The default radius of 4
        # hours separates e.g. morning, afternoon, evening, early night.
        self.timeOfDayEncoder = None
        if timeOfDay != 0:
            w, radius = splitSpec(timeOfDay, 4)
            self.timeOfDayEncoder = ScalarEncoder(w=w,
                                                  minval=0,
                                                  maxval=24,
                                                  periodic=True,
                                                  radius=radius,
                                                  name="time of day")
            self.timeOfDayOffset = attach("time of day",
                                          self.timeOfDayEncoder)
Esempio n. 7
0
class DateEncoder(Encoder):
    """A date encoder encodes a date according to encoding parameters
    specified in its constructor.

    The input to a date encoder is a datetime.datetime object. The output
    is the concatenation of several sub-encodings, each of which encodes
    a different aspect of the date. Which sub-encodings are present, and
    details of those sub-encodings, are specified in the DateEncoder
    constructor.

    Each parameter describes one attribute to encode. By default, the
    attribute is not encoded.

    season (season of the year; units = day):
      (int) width of attribute; default radius = 91.5 days (1 season)
      (tuple)  season[0] = width; season[1] = radius

    dayOfWeek (monday = 0; units = day)
      (int) width of attribute; default radius = 1 day
      (tuple) dayOfWeek[0] = width; dayOfWeek[1] = radius

    weekend (boolean: 0, 1)
      (int) width of attribute; default radius = 1
      (tuple) weekend[0] = width; weekend[1] = radius

    customDays (boolean: 0, 1)
      (tuple) customDays[0] = width; customDays[1] = a day name or a list of
              day names ("mon"/"monday" ... "sun"/"sunday", any case) that
              are encoded as 1

    holiday (boolean: 0, 1)
      (int) width of attribute

    timeOfDay (midnight = 0; units = hour)
      (int) width of attribute: default radius = 4 hours
      (tuple) timeOfDay[0] = width; timeOfDay[1] = radius

    Sub-encodings are always laid out in this order: season, day of week,
    weekend, custom days, holiday, time of day.
    """

    def __init__(self,
                 season=0,
                 dayOfWeek=0,
                 weekend=0,
                 holiday=0,
                 timeOfDay=0,
                 customDays=0,
                 name=''):
        """Create one ScalarEncoder per requested date attribute.

        An attribute whose argument equals 0 is not encoded and its
        ``<attribute>Encoder`` member stays None. See the class docstring
        for the accepted form of each argument.
        """
        self.width = 0
        self.description = []
        self.name = name

        # (name, encoder, offset) tuples, in encoding order. Every method
        # that walks the sub-encoders relies on this order.
        self.encoders = []

        def splitSpec(spec, defaultRadius):
            """Return (w, radius) from a bare width or an indexable pair."""
            if hasattr(spec, "__getitem__"):
                return spec[0], spec[1]
            return spec, defaultRadius

        def attach(label, encoder):
            """Add encoder to the layout and return its bit offset."""
            offset = self.width
            self.width += encoder.getWidth()
            self.description.append((label, offset))
            self.encoders.append((label, encoder, offset))
            return offset

        self.seasonEncoder = None
        if season != 0:
            # Ignore leap year differences -- assume 366 days in a year.
            # Radius = 91.5 days = length of season.
            # Value is the number of days since the beginning of the year
            # (0 - 365).
            w, radius = splitSpec(season, 91.5)
            self.seasonEncoder = ScalarEncoder(w=w,
                                               minval=0,
                                               maxval=366,
                                               radius=radius,
                                               periodic=True,
                                               name="season")
            self.seasonOffset = attach("season", self.seasonEncoder)

        self.dayOfWeekEncoder = None
        if dayOfWeek != 0:
            # Value is day of week; radius is 1 day by default.
            w, radius = splitSpec(dayOfWeek, 1)
            self.dayOfWeekEncoder = ScalarEncoder(w=w,
                                                  minval=0,
                                                  maxval=7,
                                                  radius=radius,
                                                  periodic=True,
                                                  name="day of week")
            self.dayOfWeekOffset = attach("day of week",
                                          self.dayOfWeekEncoder)

        self.weekendEncoder = None
        if weekend != 0:
            # Binary value, somewhat redundant with dayOfWeek. The radius
            # defaults to 1 when only a width is given.
            w, radius = splitSpec(weekend, 1)
            self.weekendEncoder = ScalarEncoder(w=w,
                                                minval=0,
                                                maxval=1,
                                                periodic=False,
                                                radius=radius,
                                                name="weekend")
            self.weekendOffset = attach("weekend", self.weekendEncoder)

        # Custom days: customDays[0] is the width, customDays[1] is a single
        # day of the week or a list of the days to be encoded as ones.
        self.customDaysEncoder = None
        if customDays != 0:
            customDayEncoderName = ""
            daysToParse = []
            assert len(
                customDays) == 2, "Please provide a w and the desired days"
            requestedDays = customDays[1]
            if isinstance(requestedDays, list):
                customDayEncoderName += "".join(
                    str(day) + " " for day in requestedDays)
                daysToParse = requestedDays
            elif isinstance(requestedDays, str):
                customDayEncoderName += requestedDays
                daysToParse = [requestedDays]
            else:
                assert False, "You must provide either a list of days or a single day"

            # Translate day names into weekday numbers (monday = 0).
            weekdayNumbers = {
                "mon": 0, "monday": 0,
                "tue": 1, "tuesday": 1,
                "wed": 2, "wednesday": 2,
                "thu": 3, "thursday": 3,
                "fri": 4, "friday": 4,
                "sat": 5, "saturday": 5,
                "sun": 6, "sunday": 6,
            }
            self.customDays = []
            for day in daysToParse:
                dayKey = day.lower()
                if dayKey in weekdayNumbers:
                    self.customDays.append(weekdayNumbers[dayKey])
                else:
                    assert False, "Unable to understand %s as a day of week" % str(
                        day)
            self.customDaysEncoder = ScalarEncoder(w=customDays[0],
                                                   minval=0,
                                                   maxval=1,
                                                   periodic=False,
                                                   radius=1,
                                                   name=customDayEncoderName)
            self.customDaysOffset = attach("customdays",
                                           self.customDaysEncoder)

        self.holidayEncoder = None
        if holiday != 0:
            # A "continuous" binary value: 1 on the holiday itself, with a
            # smooth ramp 0->1 on the day before and 1->0 on the day after.
            self.holidayEncoder = ScalarEncoder(w=holiday,
                                                minval=0,
                                                maxval=1,
                                                periodic=False,
                                                radius=1,
                                                name="holiday")
            self.holidayOffset = attach("holiday", self.holidayEncoder)

        self.timeOfDayEncoder = None
        if timeOfDay != 0:
            # Value is time of day in hours. The default radius of 4 hours
            # separates e.g. morning, afternoon, evening, early night.
            w, radius = splitSpec(timeOfDay, 4)
            self.timeOfDayEncoder = ScalarEncoder(w=w,
                                                  minval=0,
                                                  maxval=24,
                                                  periodic=True,
                                                  radius=radius,
                                                  name="time of day")
            self.timeOfDayOffset = attach("time of day",
                                          self.timeOfDayEncoder)

    def getWidth(self):
        """Return the summed width of all configured sub-encoders."""
        return self.width

    def getScalarNames(self, parentFieldName=''):
        """ See method description in base.py

        Returns one name per configured sub-encoder, in the same order as
        the values returned by getEncodedValues() / getScalars(). Each name
        is the sub-encoder's own name, prefixed with "parentFieldName." when
        parentFieldName is not empty.
        """
        names = []
        # Walk self.encoders rather than testing each encoder attribute by
        # hand: the hand-written sequence used to list custom days before
        # weekend, which mismatched the value order whenever both were
        # enabled.
        for (_label, encoder, _offset) in self.encoders:
            if parentFieldName == '':
                names.append(encoder.name)
            else:
                names.append('%s.%s' % (parentFieldName, encoder.name))
        return names

    def getEncodedValues(self, input):
        """ See method description in base.py

        Returns numpy.array([None]) for missing data. Otherwise returns a
        list with one scalar per configured sub-encoder, in the order
        season, day of week, weekend, custom days, holiday, time of day.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            return numpy.array([None])

        assert isinstance(input, datetime.datetime)
        values = []

        timetuple = input.timetuple()
        # Hours as a float; seconds are not taken into account.
        timeOfDay = timetuple.tm_hour + float(timetuple.tm_min) / 60.0

        if self.seasonEncoder is not None:
            # tm_yday is 1-based, so convert to 0-based
            values.append(timetuple.tm_yday - 1)

        if self.dayOfWeekEncoder is not None:
            values.append(timetuple.tm_wday)

        if self.weekendEncoder is not None:
            # saturday, sunday or friday evening (after 18:00)
            if timetuple.tm_wday == 6 or timetuple.tm_wday == 5 \
                or (timetuple.tm_wday == 4 and timeOfDay > 18):
                weekend = 1
            else:
                weekend = 0
            values.append(weekend)

        if self.customDaysEncoder is not None:
            if timetuple.tm_wday in self.customDays:
                customDay = 1
            else:
                customDay = 0
            values.append(customDay)

        if self.holidayEncoder is not None:
            # A "continuous" binary value: 1 on the holiday itself, with a
            # smooth ramp 0->1 on the day before and 1->0 on the day after.
            # Currently the only holiday we know about is December 25.
            # holidays is a list of (month, day) pairs that fall on a fixed
            # date every year.
            holidays = [(12, 25)]
            val = 0
            for h in holidays:
                # hdate is midnight on the holiday
                hdate = datetime.datetime(timetuple.tm_year, h[0], h[1], 0, 0,
                                          0)
                if input > hdate:
                    diff = input - hdate
                    if diff.days == 0:
                        # return 1 on the holiday itself
                        val = 1
                        break
                    elif diff.days == 1:
                        # ramp smoothly from 1 -> 0 on the next day
                        val = 1.0 - (float(diff.seconds) / (86400))
                        break
                else:
                    diff = hdate - input
                    if diff.days == 0:
                        # ramp smoothly from 0 -> 1 on the previous day
                        val = 1.0 - (float(diff.seconds) / 86400)

            values.append(val)

        if self.timeOfDayEncoder is not None:
            values.append(timeOfDay)

        return values

    def getScalars(self, input):
        """ See method description in base.py

        Parameters:
        -----------------------------------------------------------------------
        input:          A datetime object representing the time being encoded

        Returns:        A numpy array of the corresponding scalar values in
                        the following order:

                        [season, dayOfWeek, weekend, customDays, holiday,
                         timeOfDay]

                        Note: some of these fields might be omitted if they
                        were not specified in the encoder
        """
        return numpy.array(self.getEncodedValues(input))

    def getBucketIndices(self, input):
        """ See method description in base.py

        Returns one None per sub-encoder for missing data; otherwise the
        concatenated bucket indices of every sub-encoder.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            return [None] * len(self.encoders)

        assert isinstance(input, datetime.datetime)

        # Get the scalar values for each sub-field
        scalars = self.getScalars(input)

        # Encode each sub-field
        result = []
        for i, (name, encoder, offset) in enumerate(self.encoders):
            result.extend(encoder.getBucketIndices(scalars[i]))
        return result

    def encodeIntoArray(self, input, output):
        """ See method description in base.py

        Missing data zeroes the whole output. Any other input must be a
        datetime.datetime, otherwise ValueError is raised.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            output[0:] = 0
            return

        if not isinstance(input, datetime.datetime):
            raise ValueError(
                "Input is type %s, expected datetime. Value: %s" %
                (type(input), str(input)))

        # Get the scalar values for each sub-field
        scalars = self.getScalars(input)

        # Each sub-encoder writes into output starting at its own offset
        for i, (name, encoder, offset) in enumerate(self.encoders):
            encoder.encodeIntoArray(scalars[i], output[offset:])

    def getDescription(self):
        """Return the list of (name, offset) pairs built in the constructor."""
        return self.description
Esempio n. 8
0
class LogEncoder(Encoder):
    """A Log encoder represents a floating point value on a logarithmic (decibel)
    scale.

      valueToEncode = 10 * log10(input)

    The default resolution (minimum difference in scaled values which is
    guaranteed to produce different outputs) is 1 decibel. For example, the
    scaled values 10 and 11 will be distinguishable in the output. In terms of
    the original input values, this means 10^1 (10) and 10^1.1 (12.5) will be
    distinguishable.

      w -- passed through to the underlying ScalarEncoder
      resolution -- encoder resolution, in terms of scaled values.
                    Default: 1 decibel
      minval -- must be greater than 0. Lower values are reset to this value
      maxval -- Higher values are reset to this value
      name -- field name used in the description and in decode() results
      verbosity -- when >= 2, encodeIntoArray() prints debugging output

    Both bounds are converted to whole decibels in the constructor, so the
    effective self.minval / self.maxval may differ from the values passed in.
    """

    def __init__(self, w=5, resolution=1.0, minval=0.10, maxval=10000, name="log", verbosity=0):

        self.encoders = None
        self.verbosity = verbosity

        # Work in whole decibels: the lower bound is truncated by int(), the
        # upper bound is rounded up.
        # NOTE(review): int() truncates toward zero, so a non-integral negative
        # decibel value moves the effective lower bound *above* the requested
        # minval -- confirm this is intended.
        self.minScaledValue = int(10 * math.log10(minval))
        self.maxScaledValue = int(math.ceil(10 * math.log10(maxval)))
        assert self.maxScaledValue > self.minScaledValue

        # Effective bounds in normal space, after the decibel rounding above
        self.minval = 10 ** (self.minScaledValue / 10.0)
        self.maxval = 10 ** (self.maxScaledValue / 10.0)

        # Note: passing resolution=1 causes the test to topDownCompute
        # test to fail.  Fixed for now by always converting to float,
        # but should find the root cause.
        self.encoder = ScalarEncoder(
            w=w, minval=self.minScaledValue, maxval=self.maxScaledValue, periodic=False, resolution=float(resolution)
        )
        self.width = self.encoder.getWidth()
        self.description = [(name, 0)]
        self.name = name

        # This list is created by getBucketValues() the first time it is called,
        #  and re-created whenever our buckets would be re-arranged.
        self._bucketValues = None

    ############################################################################
    def getWidth(self):
        """ Return the width of the underlying scalar encoder's output """
        return self.width

    ############################################################################
    def getDescription(self):
        """ Return the single-entry description list [(name, 0)] """
        return self.description

    ############################################################################
    def _getScaledValue(self, input):
        """ Convert the input, which is in normal space, into log space.

        Returns None for SENTINEL_VALUE_FOR_MISSING_DATA. Any other input is
        first clipped to [self.minval, self.maxval] and then converted to
        decibels.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            return None

        val = input
        if val < self.minval:
            val = self.minval
        elif val > self.maxval:
            val = self.maxval

        return 10 * math.log10(val)

    ############################################################################
    def getBucketIndices(self, input):
        """ See the function description in base.py

        Returns [None] for missing data, otherwise the bucket indices of the
        scaled value as reported by the underlying scalar encoder.
        """

        # Get the scaled value
        scaledVal = self._getScaledValue(input)

        if scaledVal is None:
            return [None]
        return self.encoder.getBucketIndices(scaledVal)

    ############################################################################
    def encodeIntoArray(self, input, output):
        """ See the function description in base.py

        Missing data clears the whole output; anything else is encoded in log
        space by the underlying scalar encoder.
        """

        # Get the scaled value
        scaledVal = self._getScaledValue(input)

        if scaledVal is None:
            output[0:] = 0
        else:
            self.encoder.encodeIntoArray(scaledVal, output)

            if self.verbosity >= 2:
                # A single pre-formatted argument prints the same text with the
                # Python 2 print statement and the Python 3 print function.
                print("input: %s scaledVal: %s output: %s" % (input, scaledVal, output))
                print("decoded: %s" % (self.decodedToStr(self.decode(output)),))

    ############################################################################
    def decode(self, encoded, parentFieldName=""):
        """ See the function description in base.py

        Decodes with the underlying scalar encoder and converts every decoded
        range from decibels back into normal space. Returns
        ({fieldName: (ranges, description)}, [fieldName]), or the scalar
        encoder's empty result unchanged when nothing was decoded.
        """

        # Get the scalar values from the underlying scalar encoder
        (fieldsDict, fieldNames) = self.encoder.decode(encoded)
        if len(fieldsDict) == 0:
            return (fieldsDict, fieldNames)

        # Expect only 1 field
        assert len(fieldsDict) == 1

        # list() keeps this working where dict.values() is a non-indexable view
        (inRanges, _) = list(fieldsDict.values())[0]

        # Convert each range into normal space
        outRanges = []
        for (minV, maxV) in inRanges:
            outRanges.append((math.pow(10, minV / 10.0), math.pow(10, maxV / 10.0)))

        # Generate a text description of the ranges; a degenerate range is
        # shown as a single value
        pieces = []
        for (low, high) in outRanges:
            if low != high:
                pieces.append("%.2f-%.2f" % (low, high))
            else:
                pieces.append("%.2f" % low)
        desc = ", ".join(pieces)

        # Return result
        if parentFieldName != "":
            fieldName = "%s.%s" % (parentFieldName, self.name)
        else:
            fieldName = self.name
        return ({fieldName: (outRanges, desc)}, [fieldName])

    ############################################################################
    def getBucketValues(self):
        """ See the function description in base.py

        The bucket values of the underlying scalar encoder, converted from
        decibels to normal space. Computed on first use and cached.
        """

        # Need to re-create?
        if self._bucketValues is None:
            self._bucketValues = [
                math.pow(10, scaledValue / 10.0) for scaledValue in self.encoder.getBucketValues()
            ]

        return self._bucketValues

    ############################################################################
    def getBucketInfo(self, buckets):
        """ See the function description in base.py

        Returns a one-element list holding an EncoderResult whose value and
        scalar are the bucket's value in normal space.
        """

        scaledResult = self.encoder.getBucketInfo(buckets)[0]
        scaledValue = scaledResult.value
        value = math.pow(10, scaledValue / 10.0)

        return [EncoderResult(value=value, scalar=value, encoding=scaledResult.encoding)]

    ############################################################################
    def topDownCompute(self, encoded):
        """ See the function description in base.py

        Returns a single EncoderResult in normal space. Note that, unlike
        getBucketInfo(), the result is not wrapped in a list.
        """

        scaledResult = self.encoder.topDownCompute(encoded)[0]
        scaledValue = scaledResult.value
        value = math.pow(10, scaledValue / 10.0)

        return EncoderResult(value=value, scalar=value, encoding=scaledResult.encoding)

    ############################################################################
    def closenessScores(self, expValues, actValues, fractional=True):
        """ See the function description in base.py

        Only the first element of expValues and actValues is used. With
        fractional=True the result is 1.0 minus the decibel error expressed as
        a fraction of the encoder's range (capped at 1.0); otherwise it is the
        absolute decibel error. Returned as a one-element numpy array.
        """

        # Compare in log space. Non-positive values have no logarithm, so they
        # are pinned to the bottom of the range.
        if expValues[0] > 0:
            expValue = 10 * math.log10(expValues[0])
        else:
            expValue = self.minScaledValue

        if actValues[0] > 0:
            actValue = 10 * math.log10(actValues[0])
        else:
            actValue = self.minScaledValue

        err = abs(expValue - actValue)
        if fractional:
            pctErr = err / (self.maxScaledValue - self.minScaledValue)
            pctErr = min(1.0, pctErr)
            closeness = 1.0 - pctErr
        else:
            closeness = err

        return numpy.array([closeness])
Esempio n. 9
0
  def __init__(self, season=0, dayOfWeek=0, weekend=0, holiday=0, timeOfDay=0, customDays=0,
                name = ''):
    """Create one ScalarEncoder per requested date attribute.

    An attribute left at 0 is not encoded. The sub-encoders are laid out
    back to back in this order: season, day of week, weekend, custom days,
    holiday, time of day.

    season, dayOfWeek, weekend, timeOfDay:
        either a width, or a subscriptable (width, radius); the default
        radius is 91.5, 1, 1 and 4 respectively
    holiday:
        width
    customDays:
        (width, days) where days is one day name or a list of day names,
        e.g. "mon" or ["Saturday", "Sunday"] (case-insensitive)
    name:
        stored as self.name
    """

    self.width = 0
    self.description = []
    self.name = name

    # This will contain a list of (name, encoder, offset) tuples for use by
    #  the decode() method
    self.encoders = []

    def _attach(label, subEncoder):
      # Place subEncoder right after the bits allocated so far and record it
      # in both bookkeeping lists; returns the offset it was given.
      start = self.width
      self.width += subEncoder.getWidth()
      self.description.append((label, start))
      self.encoders.append((label, subEncoder, start))
      return start

    def _widthAndRadius(spec, defaultRadius):
      # A subscriptable spec carries (width, radius); anything else is the
      # width alone.
      if hasattr(spec, "__getitem__"):
        return spec[0], spec[1]
      return spec, defaultRadius

    # Season: value is the number of days since the beginning of the year.
    # Leap-year differences are ignored -- a year is assumed to have 366 days.
    # Default radius = 91.5 days = length of a season.
    self.seasonEncoder = None
    if season != 0:
      bits, span = _widthAndRadius(season, 91.5)
      self.seasonEncoder = ScalarEncoder(w = bits, minval=0, maxval=366,
                                         radius=span, periodic=True,
                                         name="season")
      self.seasonOffset = _attach("season", self.seasonEncoder)

    # Day of week: default radius is 1 day
    self.dayOfWeekEncoder = None
    if dayOfWeek != 0:
      bits, span = _widthAndRadius(dayOfWeek, 1)
      self.dayOfWeekEncoder = ScalarEncoder(w = bits, minval=0, maxval=7,
                                            radius=span, periodic=True,
                                            name="day of week")
      self.dayOfWeekOffset = _attach("day of week", self.dayOfWeekEncoder)

    # Weekend: binary value. Not sure if this makes sense. Also is somewhat
    #  redundant with dayOfWeek. A radius of 1 is used if none was provided.
    self.weekendEncoder = None
    if weekend != 0:
      bits, span = _widthAndRadius(weekend, 1)
      self.weekendEncoder = ScalarEncoder(w = bits, minval = 0, maxval=1,
                                          periodic=False, radius=span,
                                          name="weekend")
      self.weekendOffset = _attach("weekend", self.weekendEncoder)

    # Custom days: first element of the tuple is the width, second is either
    # a single day of the week or a list of the days to encode as ones.
    self.customDaysEncoder = None
    if customDays != 0:
      customDayEncoderName = ""
      daysToParse = []
      assert len(customDays)==2, "Please provide a w and the desired days"
      requested = customDays[1]
      if isinstance(requested, list):
        # The encoder name is every requested day followed by a space
        customDayEncoderName = "".join(str(day) + " " for day in requested)
        daysToParse = requested
      elif isinstance(requested, str):
        customDayEncoderName = requested
        daysToParse = [requested]
      else:
        assert False, "You must provide either a list of days or a single day"

      # Parse days into weekday numbers (monday = 0)
      weekdayNumbers = {
        "mon": 0, "monday": 0,
        "tue": 1, "tuesday": 1,
        "wed": 2, "wednesday": 2,
        "thu": 3, "thursday": 3,
        "fri": 4, "friday": 4,
        "sat": 5, "saturday": 5,
        "sun": 6, "sunday": 6,
      }
      self.customDays = []
      for day in daysToParse:
        key = day.lower()
        if key in weekdayNumbers:
          self.customDays.append(weekdayNumbers[key])
        else:
          assert False, "Unable to understand %s as a day of week" % str(day)

      self.customDaysEncoder = ScalarEncoder(w=customDays[0], minval = 0, maxval=1,
                                            periodic=False, radius=1,
                                            name=customDayEncoderName)
      self.customDaysOffset = _attach("customdays", self.customDaysEncoder)

    # Holiday: a "continuous" binary value. = 1 on the holiday itself and
    #  smooth ramp 0->1 on the day before the holiday and 1->0 on the day
    #  after the holiday.
    self.holidayEncoder = None
    if holiday != 0:
      self.holidayEncoder = ScalarEncoder(w = holiday, minval = 0, maxval=1,
                                          periodic=False, radius=1,
                                          name="holiday")
      self.holidayOffset = _attach("holiday", self.holidayEncoder)

    # Time of day: value is in hours. Default radius = 4 hours, e.g. morning,
    #  afternoon, evening, early night, late night, etc.
    self.timeOfDayEncoder = None
    if timeOfDay != 0:
      bits, span = _widthAndRadius(timeOfDay, 4)
      self.timeOfDayEncoder = ScalarEncoder(w = bits, minval=0, maxval=24,
                              periodic=True, radius=span, name="time of day")
      self.timeOfDayOffset = _attach("time of day", self.timeOfDayEncoder)
Esempio n. 10
0
class DateEncoder(Encoder):
  """A date encoder encodes a date according to encoding parameters
  specified in its constructor.
  The input to a date encoder is a datetime.datetime object. The output
  is the concatenation of several sub-encodings, each of which encodes
  a different aspect of the date. Which sub-encodings are present, and
  details of those sub-encodings, are specified in the DateEncoder
  constructor.

  Each parameter describes one attribute to encode. By default, the attribute
  is not encoded.

  season (season of the year; units = day):
    (int) width of attribute; default radius = 91.5 days (1 season)
    (tuple)  season[0] = width; season[1] = radius

  dayOfWeek (monday = 0; units = day)
    (int) width of attribute; default radius = 1 day
    (tuple) dayOfWeek[0] = width; dayOfWeek[1] = radius

  weekend (boolean: 0, 1)
    (int) width of attribute; default radius = 1
    (tuple) weekend[0] = width; weekend[1] = radius

  customDays (boolean: 0, 1; 1 when the date falls on one of the given days)
    (tuple) customDays[0] = width; customDays[1] = a day name or a list of
            day names, e.g. "mon" or ["Saturday", "Sunday"] (case-insensitive)

  holiday (0..1; 1 on the holiday, ramping on the day before and after)
    (int) width of attribute

  timeOfDay (midnight = 0; units = hour)
    (int) width of attribute: default radius = 4 hours
    (tuple) timeOfDay[0] = width; timeOfDay[1] = radius

  The sub-encodings are always laid out in this order: season, day of week,
  weekend, custom days, holiday, time of day.
  """
  ############################################################################
  def __init__(self, season=0, dayOfWeek=0, weekend=0, holiday=0, timeOfDay=0, customDays=0,
                name = ''):
    """ Create one ScalarEncoder per requested attribute; see the class
    description for the meaning of each parameter.
    """

    self.width = 0
    self.description = []
    self.name = name

    # This will contain a list of (name, encoder, offset) tuples for use by
    #  the decode() method
    self.encoders = []

    def _attach(label, subEncoder):
      # Place subEncoder right after the bits allocated so far and record it
      # in both bookkeeping lists; returns the offset it was given.
      start = self.width
      self.width += subEncoder.getWidth()
      self.description.append((label, start))
      self.encoders.append((label, subEncoder, start))
      return start

    def _widthAndRadius(spec, defaultRadius):
      # A subscriptable spec carries (width, radius); anything else is the
      # width alone.
      if hasattr(spec, "__getitem__"):
        return spec[0], spec[1]
      return spec, defaultRadius

    self.seasonEncoder = None
    if season != 0:
      # Ignore leapyear differences -- assume 366 days in a year
      # Radius = 91.5 days = length of season
      # Value is number of days since beginning of year (0 - 365)
      w, radius = _widthAndRadius(season, 91.5)
      self.seasonEncoder = ScalarEncoder(w = w, minval=0, maxval=366,
                                         radius=radius, periodic=True,
                                         name="season")
      self.seasonOffset = _attach("season", self.seasonEncoder)

    self.dayOfWeekEncoder = None
    if dayOfWeek != 0:
      # Value is day of week (floating point)
      # Radius is 1 day
      w, radius = _widthAndRadius(dayOfWeek, 1)
      self.dayOfWeekEncoder = ScalarEncoder(w = w, minval=0, maxval=7,
                                            radius=radius, periodic=True,
                                            name="day of week")
      self.dayOfWeekOffset = _attach("day of week", self.dayOfWeekEncoder)

    self.weekendEncoder = None
    if weekend != 0:
      # Binary value. Not sure if this makes sense. Also is somewhat redundant
      #  with dayOfWeek
      # A radius of 1 is used if none was provided
      w, radius = _widthAndRadius(weekend, 1)
      self.weekendEncoder = ScalarEncoder(w = w, minval = 0, maxval=1,
                                          periodic=False, radius=radius,
                                          name="weekend")
      self.weekendOffset = _attach("weekend", self.weekendEncoder)

    # Set up custom days encoder, first argument in tuple is width
    # second is either a single day of the week or a list of the days
    # you want encoded as ones.
    self.customDaysEncoder = None
    if customDays != 0:
      customDayEncoderName = ""
      daysToParse = []
      assert len(customDays)==2, "Please provide a w and the desired days"
      requested = customDays[1]
      if isinstance(requested, list):
        # The encoder name is every requested day followed by a space
        customDayEncoderName = "".join(str(day) + " " for day in requested)
        daysToParse = requested
      elif isinstance(requested, str):
        customDayEncoderName = requested
        daysToParse = [requested]
      else:
        assert False, "You must provide either a list of days or a single day"

      # Parse days into weekday numbers (monday = 0)
      weekdayNumbers = {
        "mon": 0, "monday": 0,
        "tue": 1, "tuesday": 1,
        "wed": 2, "wednesday": 2,
        "thu": 3, "thursday": 3,
        "fri": 4, "friday": 4,
        "sat": 5, "saturday": 5,
        "sun": 6, "sunday": 6,
      }
      self.customDays = []
      for day in daysToParse:
        key = day.lower()
        if key in weekdayNumbers:
          self.customDays.append(weekdayNumbers[key])
        else:
          assert False, "Unable to understand %s as a day of week" % str(day)

      self.customDaysEncoder = ScalarEncoder(w=customDays[0], minval = 0, maxval=1,
                                            periodic=False, radius=1,
                                            name=customDayEncoderName)
      self.customDaysOffset = _attach("customdays", self.customDaysEncoder)

    self.holidayEncoder = None
    if holiday != 0:
      # A "continuous" binary value. = 1 on the holiday itself and smooth ramp
      #  0->1 on the day before the holiday and 1->0 on the day after the holiday.
      self.holidayEncoder = ScalarEncoder(w = holiday, minval = 0, maxval=1,
                                          periodic=False, radius=1,
                                          name="holiday")
      self.holidayOffset = _attach("holiday", self.holidayEncoder)

    self.timeOfDayEncoder = None
    if timeOfDay != 0:
      # Value is time of day in hours
      # Radius = 4 hours, e.g. morning, afternoon, evening, early night,
      #  late night, etc.
      w, radius = _widthAndRadius(timeOfDay, 4)
      self.timeOfDayEncoder = ScalarEncoder(w = w, minval=0, maxval=24,
                              periodic=True, radius=radius, name="time of day")
      self.timeOfDayOffset = _attach("time of day", self.timeOfDayEncoder)

  ############################################################################
  def getWidth(self):
    """ Return the summed width of all configured sub-encoders """
    return self.width

  ############################################################################
  def getScalarNames(self, parentFieldName=''):
    """ See method description in base.py

    Returns one name per configured sub-encoder, each prefixed with
    "<parentFieldName>." when parentFieldName is not empty.
    """

    # Walk self.encoders so the names line up, position by position, with the
    # values produced by getEncodedValues() / getScalars(): weekend comes
    # before custom days there, and the names must follow the same order.
    names = []
    for (_, encoder, _) in self.encoders:
      if parentFieldName == '':
        names.append(encoder.name)
      else:
        names.append('%s.%s' % (parentFieldName, encoder.name))

    return names

  ############################################################################
  def getEncodedValues(self, input):
    """ See method description in base.py

    Returns numpy.array([None]) for missing data. Otherwise returns a plain
    list holding one value per configured sub-field, in the order season,
    dayOfWeek, weekend, customDays, holiday, timeOfDay.
    """

    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
      return numpy.array([None])

    assert isinstance(input, datetime.datetime)
    values = []

    # -------------------------------------------------------------------------
    # Get the scalar values for each sub-field
    timetuple = input.timetuple()
    # Fractional hours; seconds are not taken into account
    timeOfDay = timetuple.tm_hour + float(timetuple.tm_min)/60.0

    if self.seasonEncoder is not None:
      dayOfYear = timetuple.tm_yday
      # input.timetuple() computes the day of year 1 based, so convert to 0 based
      values.append(dayOfYear-1)

    if self.dayOfWeekEncoder is not None:
      dayOfWeek = timetuple.tm_wday #+ timeOfDay / 24.0
      values.append(dayOfWeek)

    if self.weekendEncoder is not None:
      # saturday, sunday or friday evening
      if timetuple.tm_wday == 6 or timetuple.tm_wday == 5 \
          or (timetuple.tm_wday == 4 and timeOfDay > 18):
        weekend = 1
      else:
        weekend = 0
      values.append(weekend)

    if self.customDaysEncoder is not None:
      if timetuple.tm_wday in self.customDays:
        customDay = 1
      else:
        customDay = 0
      values.append(customDay)

    if self.holidayEncoder is not None:
      # A "continuous" binary value. = 1 on the holiday itself and smooth ramp
      #  0->1 on the day before the holiday and 1->0 on the day after the holiday.
      # Currently the only holiday we know about is December 25
      # holidays is a list of holidays that occur on a fixed date every year
      holidays = [(12, 25)]
      val = 0
      for h in holidays:
        # hdate is midnight on the holiday
        hdate = datetime.datetime(timetuple.tm_year, h[0], h[1], 0, 0, 0)
        if input > hdate:
          diff = input - hdate
          if diff.days == 0:
            # return 1 on the holiday itself
            val = 1
            break
          elif diff.days == 1:
            # ramp smoothly from 1 -> 0 on the next day
            val = 1.0 - (float(diff.seconds) / (86400))
            break
        else:
          diff = hdate - input
          if diff.days == 0:
            # ramp smoothly from 0 -> 1 on the previous day
            val = 1.0 - (float(diff.seconds) / 86400)

      values.append(val)

    if self.timeOfDayEncoder is not None:
      values.append(timeOfDay)

    return values

  ############################################################################
  def getScalars(self, input):
    """ See method description in base.py

    Parameters:
    -----------------------------------------------------------------------
    input:          A datetime object representing the time being encoded

    Returns:        A numpy array of the corresponding scalar values in
                    the following order:

                    [season, dayOfWeek, weekend, customDays, holiday, timeOfDay]

                    Note: some of these fields might be omitted if they were not
                    specified in the encoder
    """
    return numpy.array(self.getEncodedValues(input))

  ############################################################################
  def getBucketIndices(self, input):
    """ See method description in base.py

    Returns one None per sub-encoder for missing data, otherwise a flat list
    built by extending it with each sub-encoder's getBucketIndices() result.
    """

    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
      # Nothing to encode: one placeholder per sub-field
      return [None] * len(self.encoders)

    assert isinstance(input, datetime.datetime)

    # Get the scalar values for each sub-field
    scalars = self.getScalars(input)

    # Look up the buckets of each sub-field
    result = []
    for i, (name, encoder, offset) in enumerate(self.encoders):
      result.extend(encoder.getBucketIndices(scalars[i]))
    return result

  ############################################################################
  def encodeIntoArray(self, input, output):
    """ See method description in base.py

    Missing data clears the whole output. Raises ValueError when input is
    neither the missing-data sentinel nor a datetime.datetime.
    """

    if input == SENTINEL_VALUE_FOR_MISSING_DATA:
      output[0:] = 0
      return

    if not isinstance(input, datetime.datetime):
      raise ValueError("Input is type %s, expected datetime. Value: %s" % (
          type(input), str(input)))

    # Get the scalar values for each sub-field
    scalars = self.getScalars(input)

    # Each sub-encoder writes into the slice of output starting at its offset
    for i, (name, encoder, offset) in enumerate(self.encoders):
      encoder.encodeIntoArray(scalars[i], output[offset:])


  ############################################################################
  def getDescription(self):
    """ Return the list of (name, offset) pairs, one per sub-encoder """
    return self.description
Esempio n. 11
0
class CategoryEncoder(Encoder):
    """Encodes a list of discrete categories (described by strings), that aren't
    related to each other, so we never emit a mixture of categories.

    The value of zero is reserved for "unknown category"

    Internally we use a ScalarEncoder with a radius of 1, but since we only encode
    integers, we never get mixture outputs.

    The SDRCategoryEncoder uses a different method to encode categories"""

    ############################################################################
    def __init__(self, w, categoryList, name="category", verbosity=0):
        """
        w -- passed through to the underlying ScalarEncoder; the total width
             is w * (number of categories + 1)
        categoryList -- the known categories; category i gets index i + 1
        name -- field name used in the description and in decode() results
        verbosity -- when >= 2, encodeIntoArray() prints debugging output
        """

        self.encoders = None
        self.verbosity = verbosity

        # number of categories includes "unknown"
        self.ncategories = len(categoryList) + 1

        # Index 0 is reserved for unknown categories; real ones start at 1
        self.categoryToIndex = dict()
        self.indexToCategory = dict()
        self.indexToCategory[0] = "<UNKNOWN>"
        for index, category in enumerate(categoryList, 1):
            self.categoryToIndex[category] = index
            self.indexToCategory[index] = category

        self.encoder = ScalarEncoder(w,
                                     minval=0,
                                     maxval=self.ncategories - 1,
                                     radius=1,
                                     periodic=False)
        self.width = w * self.ncategories
        assert self.encoder.getWidth() == self.width

        self.description = [(name, 0)]
        self.name = name

        # These are used to support the topDownCompute method
        self._topDownMappingM = None

        # This gets filled in by getBucketValues
        self._bucketValues = None

    ############################################################################
    def getDecoderOutputFieldTypes(self):
        """ [Encoder class virtual method override]
        """
        # TODO: change back to string meta-type after the decoding logic is fixed
        #       to output strings instead of internal index values.
        #return (FieldMetaType.string,)
        return (FieldMetaType.integer, )

    ############################################################################
    def getWidth(self):
        """ Return the total output width, w * (number of categories + 1) """
        return self.width

    ############################################################################
    def getDescription(self):
        """ Return the single-entry description list [(name, 0)] """
        return self.description

    ############################################################################
    def getScalars(self, input):
        """ See method description in base.py

        Returns a one-element numpy array: None for missing data, otherwise
        the category index (0 for a category that is not known).
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            return numpy.array([None])
        return numpy.array([self.categoryToIndex.get(input, 0)])

    ############################################################################
    def getBucketIndices(self, input):
        """ See method description in base.py

        Returns [None] for missing data, otherwise the bucket indices of the
        category index as reported by the underlying scalar encoder.
        """

        # Get the bucket index from the underlying scalar encoder
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            return [None]
        return self.encoder.getBucketIndices(
            self.categoryToIndex.get(input, 0))

    ############################################################################
    def encodeIntoArray(self, input, output):
        """ See method description in base.py

        Missing data clears the whole output; a category that is not known is
        encoded as index 0.
        """
        # if not found, we encode category 0
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            output[0:] = 0
            val = "<missing>"
        else:
            val = self.categoryToIndex.get(input, 0)
            self.encoder.encodeIntoArray(val, output)

        if self.verbosity >= 2:
            # A single pre-formatted argument prints the same text with the
            # Python 2 print statement and the Python 3 print function.
            print("input: %s va: %s output: %s" % (input, val, output))
            print("decoded: %s" % (self.decodedToStr(self.decode(output)),))

    ############################################################################
    def decode(self, encoded, parentFieldName=''):
        """ See the function description in base.py

        Returns ({fieldName: (ranges, description)}, [fieldName]) where ranges
        holds (min, max) category index pairs and description lists the names
        of every category covered by those ranges. The scalar encoder's empty
        result is returned unchanged when nothing was decoded.
        """

        # Get the scalar values from the underlying scalar encoder
        (fieldsDict, fieldNames) = self.encoder.decode(encoded)
        if len(fieldsDict) == 0:
            return (fieldsDict, fieldNames)

        # Expect only 1 field
        assert (len(fieldsDict) == 1)

        # list() keeps this working where dict.values() is a non-indexable view
        (inRanges, _) = list(fieldsDict.values())[0]

        # Get the list of categories the scalar values correspond to and
        #  generate the description from the category name(s).
        outRanges = []
        categoryNames = []
        for (minV, maxV) in inRanges:
            minV = int(round(minV))
            maxV = int(round(maxV))
            outRanges.append((minV, maxV))
            for index in range(minV, maxV + 1):
                categoryNames.append(self.indexToCategory[index])
        desc = ", ".join(categoryNames)

        # Return result
        if parentFieldName != '':
            fieldName = "%s.%s" % (parentFieldName, self.name)
        else:
            fieldName = self.name
        return ({fieldName: (outRanges, desc)}, [fieldName])

    ############################################################################
    def closenessScores(
        self,
        expValues,
        actValues,
        fractional=True,
    ):
        """ See the function description in base.py

        Only the first element of expValues and actValues is compared, and
        categories either match or they don't. With fractional=True the result
        is 1.0 for a match and 0.0 otherwise; with fractional=False it is
        inverted into an error (0.0 for a match, 1.0 otherwise). Returned as a
        one-element numpy array.
        """

        expValue = expValues[0]
        actValue = actValues[0]

        if expValue == actValue:
            closeness = 1.0
        else:
            closeness = 0.0

        if not fractional:
            closeness = 1.0 - closeness

        return numpy.array([closeness])

    ############################################################################
    def getBucketValues(self):
        """ See the function description in base.py

        The category of every bucket of the underlying scalar encoder.
        Computed on first use and cached.
        """

        if self._bucketValues is None:
            numBuckets = len(self.encoder.getBucketValues())
            self._bucketValues = [
                self.getBucketInfo([bucketIndex])[0].value
                for bucketIndex in range(numBuckets)
            ]

        return self._bucketValues

    ############################################################################
    def getBucketInfo(self, buckets):
        """ See the function description in base.py

        Returns a one-element list holding an EncoderResult whose value is the
        category and whose scalar is the category index.
        """

        # For the category encoder, the bucket index is the category index
        bucketInfo = self.encoder.getBucketInfo(buckets)[0]

        categoryIndex = int(round(bucketInfo.value))
        category = self.indexToCategory[categoryIndex]

        return [
            EncoderResult(value=category,
                          scalar=categoryIndex,
                          encoding=bucketInfo.encoding)
        ]

    ############################################################################
    def topDownCompute(self, encoded):
        """ See the function description in base.py

        Returns a single EncoderResult for the best matching category. Note
        that, unlike getBucketInfo(), the result is not wrapped in a list.
        """

        encoderResult = self.encoder.topDownCompute(encoded)[0]
        value = encoderResult.value
        categoryIndex = int(round(value))
        category = self.indexToCategory[categoryIndex]

        return EncoderResult(value=category,
                             scalar=categoryIndex,
                             encoding=encoderResult.encoding)
Esempio n. 12
0
class LogEncoder(Encoder):
    """A Log encoder represents a floating point value on a logarithmic
    (decibel) scale.

        valueToEncode = 10 * log10(input)

    The scaled value is handed to an underlying ScalarEncoder, which does the
    actual encoding.

    The default resolution (minimum difference in scaled values which is
    guaranteed to produce different outputs) is 1 decibel. For example, the
    scaled values 10 and 11 will be distinguishable in the output. In terms of
    the original input values, this means 10^1 (10) and 10^1.1 (12.5) will be
    distinguishable.

    Constructor arguments:

    w -- forwarded unchanged to the underlying ScalarEncoder
    resolution -- encoder resolution, in terms of scaled values.
                  Default: 1 decibel
    minval -- must be greater than 0. Lower values are reset to the effective
              minimum (see the note in __init__ about integer snapping)
    maxval -- Higher values are reset to the effective maximum
    name -- field name used in getDescription() and decode()
    verbosity -- when >= 2, encodeIntoArray() prints debugging output
    """
    def __init__(self,
                 w=5,
                 resolution=1.0,
                 minval=0.10,
                 maxval=10000,
                 name="log",
                 verbosity=0):

        self.encoders = None
        self.verbosity = verbosity

        # The range is snapped to whole decibels. Note that int() truncates
        # toward zero, so a negative, non-integer scaled minimum is rounded
        # *up*; the scaled maximum is always rounded up with ceil().
        self.minScaledValue = int(10 * math.log10(minval))
        self.maxScaledValue = int(math.ceil(10 * math.log10(maxval)))
        assert self.maxScaledValue > self.minScaledValue

        # Effective limits in normal space, recomputed from the snapped
        # decibel limits. Inputs are clipped to these, not to the raw
        # arguments.
        self.minval = 10**(self.minScaledValue / 10.0)
        self.maxval = 10**(self.maxScaledValue / 10.0)

        # Note: passing an integer resolution (e.g. resolution=1) causes the
        # topDownCompute test to fail. Worked around for now by always
        # converting to float, but the root cause should be found.
        self.encoder = ScalarEncoder(w=w,
                                     minval=self.minScaledValue,
                                     maxval=self.maxScaledValue,
                                     periodic=False,
                                     resolution=float(resolution))
        self.width = self.encoder.getWidth()
        self.description = [(name, 0)]
        self.name = name

        # This list is created by getBucketValues() the first time it is
        # called, and re-created whenever our buckets would be re-arranged.
        self._bucketValues = None

    ############################################################################
    def getWidth(self):
        """Return the output width, as reported by the underlying encoder."""
        return self.width

    ############################################################################
    def getDescription(self):
        """Return the description list ``[(name, 0)]`` built in __init__."""
        return self.description

    ############################################################################
    def _getScaledValue(self, input):
        """Convert the input, which is in normal space, into log space.

        Returns None when ``input`` is the missing-data sentinel. Otherwise
        the input is clipped to [self.minval, self.maxval] and
        ``10 * log10(clipped)`` is returned.
        """
        if input == SENTINEL_VALUE_FOR_MISSING_DATA:
            return None

        val = input
        if val < self.minval:
            val = self.minval
        elif val > self.maxval:
            val = self.maxval

        return 10 * math.log10(val)

    ############################################################################
    def getBucketIndices(self, input):
        """Return the underlying encoder's bucket indices for ``input``.

        Returns ``[None]`` when ``input`` is the missing-data sentinel.
        """
        scaledVal = self._getScaledValue(input)
        if scaledVal is None:
            return [None]
        return self.encoder.getBucketIndices(scaledVal)

    ############################################################################
    def encodeIntoArray(self, input, output):
        """Encode ``input`` into ``output`` in place.

        Missing data zeroes the whole of ``output``. Otherwise the scaled
        value is encoded by the underlying encoder and, when verbosity >= 2,
        the input, scaled value, output and decoded output are printed.
        """
        scaledVal = self._getScaledValue(input)

        if scaledVal is None:
            output[0:] = 0
            return

        self.encoder.encodeIntoArray(scaledVal, output)

        if self.verbosity >= 2:
            # Single pre-formatted argument: prints the same text under the
            # Python 2 print statement and the Python 3 print function.
            print("input: %s scaledVal: %s output: %s"
                  % (input, scaledVal, output))
            print("decoded: %s"
                  % (self.decodedToStr(self.decode(output)),))

    ############################################################################
    def decode(self, encoded, parentFieldName=''):
        """Decode ``encoded`` into ranges expressed in normal space.

        Returns ``(fieldsDict, fieldNames)``. When the underlying encoder
        decodes nothing, its (empty) result is returned unchanged. Otherwise
        ``fieldsDict`` maps the field name (prefixed with ``parentFieldName.``
        when that is non-empty) to ``(ranges, description)``, where each
        range is a ``(min, max)`` tuple converted back from decibels.
        """
        # Get the scalar values from the underlying scalar encoder
        (fieldsDict, fieldNames) = self.encoder.decode(encoded)
        if not fieldsDict:
            return (fieldsDict, fieldNames)

        # Expect only 1 field
        assert len(fieldsDict) == 1

        # list(...) so this works with Python 3 dict views as well.
        (inRanges, _inDesc) = list(fieldsDict.values())[0]

        # Convert each range into normal space
        outRanges = [(math.pow(10, minV / 10.0), math.pow(10, maxV / 10.0))
                     for (minV, maxV) in inRanges]

        # Generate a text description of the ranges; a degenerate range is
        # shown as a single value.
        desc = ", ".join(
            ("%.2f-%.2f" % (low, high)) if low != high else ("%.2f" % low)
            for (low, high) in outRanges)

        # Return result
        if parentFieldName != '':
            fieldName = "%s.%s" % (parentFieldName, self.name)
        else:
            fieldName = self.name
        return ({fieldName: (outRanges, desc)}, [fieldName])

    ############################################################################
    def getBucketValues(self):
        """Return the bucket values in normal space.

        The underlying encoder's bucket values (decibels) are converted with
        ``10 ** (v / 10)``. The list is computed once and cached in
        ``self._bucketValues``.
        """
        # Need to re-create?
        if self._bucketValues is None:
            # Assign only once the whole list is built so a failure cannot
            # leave a partially filled cache behind.
            self._bucketValues = [
                math.pow(10, scaledValue / 10.0)
                for scaledValue in self.encoder.getBucketValues()]

        return self._bucketValues

    ############################################################################
    def getBucketInfo(self, buckets):
        """Return a one-element list with the EncoderResult for ``buckets``.

        The first result of the underlying encoder is used; its value is
        converted from decibels to normal space and used for both ``value``
        and ``scalar``. The encoding is passed through unchanged.
        """
        scaledResult = self.encoder.getBucketInfo(buckets)[0]
        value = math.pow(10, scaledResult.value / 10.0)

        return [
            EncoderResult(value=value,
                          scalar=value,
                          encoding=scaledResult.encoding)
        ]

    ############################################################################
    def topDownCompute(self, encoded):
        """Return the EncoderResult for ``encoded`` in normal space.

        The first result of the underlying encoder's topDownCompute() is
        used; its value is converted from decibels to normal space. Note that
        a single EncoderResult is returned, not a list.
        """
        scaledResult = self.encoder.topDownCompute(encoded)[0]
        value = math.pow(10, scaledResult.value / 10.0)

        return EncoderResult(value=value,
                             scalar=value,
                             encoding=scaledResult.encoding)

    ############################################################################
    def closenessScores(self, expValues, actValues, fractional=True):
        """Compare the first expected and actual values in log space.

        Only ``expValues[0]`` and ``actValues[0]`` are used. Values that are
        not positive are treated as ``self.minScaledValue``.

        When ``fractional`` is true the result is ``1 - pctErr``, where
        ``pctErr`` is the absolute decibel error divided by the encoder's
        decibel range and capped at 1.0. Otherwise the absolute decibel error
        itself is returned. Either way the result is a one-element numpy
        array.
        """
        # log10 is undefined for non-positive values; fall back to the bottom
        # of the scaled range for those.
        expValue, actValue = [
            10 * math.log10(raw) if raw > 0 else self.minScaledValue
            for raw in (expValues[0], actValues[0])]

        err = abs(expValue - actValue)

        if fractional:
            # Explicit float so the division never truncates, whatever the
            # operand types.
            span = float(self.maxScaledValue - self.minScaledValue)
            closeness = 1.0 - min(1.0, err / span)
        else:
            closeness = err

        return numpy.array([closeness])