Esempio n. 1
0
    def testUnicode(self):
        """
        Check the effect of 'tokenSimilarity' on documents made of non-ASCII
        tokens.

        Two documents are encoded whose tokens are identical except that every
        token of the second one carries one extra trailing character.  With
        'tokenSimilarity' enabled their overlap is expected to exceed 65 bits;
        with it disabled the overlap is expected to stay below 65 bits.
        """
        shortTokens = [
            "\u0395\u0396\u0397\u0398\u0399",
            "\u0400\u0401\u0402\u0403\u0404",
            "\u0405\u0406\u0407\u0408\u0409",
        ]
        # Same tokens as above, each followed by "\u0410".
        longTokens = [
            "\u0395\u0396\u0397\u0398\u0399\u0410",
            "\u0400\u0401\u0402\u0403\u0404\u0410",
            "\u0405\u0406\u0407\u0408\u0409\u0410",
        ]
        overlapThreshold = 65

        params = SimHashDocumentEncoderParameters()
        params.size = 400
        params.sparsity = 0.33

        # 'tokenSimilarity' enabled: near-identical tokens must overlap a lot.
        params.tokenSimilarity = True
        similarityEncoder = SimHashDocumentEncoder(params)
        shortSdr = SDR(params.size)
        longSdr = SDR(params.size)
        similarityEncoder.encode(shortTokens, shortSdr)
        similarityEncoder.encode(longTokens, longSdr)
        assert shortSdr.getOverlap(longSdr) > overlapThreshold

        # 'tokenSimilarity' disabled: the same documents must overlap less.
        params.tokenSimilarity = False
        plainEncoder = SimHashDocumentEncoder(params)
        for sdr in (shortSdr, longSdr):
            sdr.zero()
        plainEncoder.encode(shortTokens, shortSdr)
        plainEncoder.encode(longTokens, longSdr)
        assert shortSdr.getOverlap(longSdr) < overlapThreshold
Esempio n. 2
0
    def testEncoding(self):
        """
        Exercise the basic encode() calling styles.

        Covers encoding the module-level 'testDoc1' document, encoding a
        space-separated string (which must produce the same SDR), and encoding
        empty inputs (which must produce an all-zero SDR).
        """
        params = SimHashDocumentEncoderParameters()
        params.size = 400
        params.activeBits = 20

        # Primary calling style: pass the document and get a new SDR back.
        # NOTE(review): 'testDoc1' is defined outside this method; presumably
        # a list of tokens -- confirm against the module-level fixture.
        listEncoder = SimHashDocumentEncoder(params)
        listResult = listEncoder.encode(testDoc1)
        assert listEncoder.size == params.size
        assert listResult.size == params.size
        assert listResult.getSum() == params.activeBits

        # Alternate calling style: the whole document as one string.
        stringEncoder = SimHashDocumentEncoder(params)
        stringResult = stringEncoder.encode("abcde fghij klmno pqrst uvwxy")
        assert listResult == stringResult

        # Empty list and empty string both encode to an SDR with no bits set.
        allZeros = SDR(params.size)
        allZeros.zero()
        fromEmptyList = listEncoder.encode([])
        fromEmptyString = listEncoder.encode("")
        assert fromEmptyList == allZeros
        assert fromEmptyString == allZeros
Esempio n. 3
0
 def testStr(self):
     """
     Check the string form of an SDR.

     The expected text is the dimensions in "SDR( ... )" followed by the flat
     indices of the active bits; an SDR with no active bits shows only the
     dimensions.
     """
     flat = SDR((103, ))
     cube = SDR((100, 100, 1))

     for bit in (0, 9, 102):
         flat.dense[bit] = 1
     # Re-assign the dense array to itself after writing into it in place
     # (presumably so the SDR registers the modification -- TODO confirm).
     flat.dense = flat.dense
     assert str(flat) == "SDR( 103 ) 0, 9, 102"

     flat.zero()
     assert str(flat) == "SDR( 103 )"

     for coordinates in ((0, 0, 0), (99, 99, 0)):
         cube.dense[coordinates] = 1
     cube.dense = cube.dense
     # The two corners show up as flat indices 0 and 9999.
     assert str(cube) == "SDR( 100, 100, 1 ) 0, 9999"
Esempio n. 4
0
    def testTokenSimilarity(self):
        """
        Check how 'tokenSimilarity' changes the relative overlaps between the
        encodings of the four module-level test documents.

        The documents 'testDoc1' .. 'testDoc4' are defined outside this
        method.  Each phase asserts a strict ordering of pairwise overlaps,
        and the ordering differs between the enabled and disabled setting.
        """
        params = SimHashDocumentEncoderParameters()
        params.size = 400
        params.sparsity = 0.33
        params.caseSensitivity = True

        documents = (testDoc1, testDoc2, testDoc3, testDoc4)

        # Phase 1: 'tokenSimilarity' enabled.
        params.tokenSimilarity = True
        similarityEncoder = SimHashDocumentEncoder(params)
        sdrs = [SDR(params.size) for _ in documents]
        sdr1, sdr2, sdr3, sdr4 = sdrs
        for document, sdr in zip(documents, sdrs):
            similarityEncoder.encode(document, sdr)
        assert sdr3.getOverlap(sdr4) > sdr2.getOverlap(sdr3)
        assert sdr2.getOverlap(sdr3) > sdr1.getOverlap(sdr3)
        assert sdr1.getOverlap(sdr3) > sdr1.getOverlap(sdr4)

        # Phase 2: 'tokenSimilarity' disabled, re-using the cleared SDRs.
        params.tokenSimilarity = False
        plainEncoder = SimHashDocumentEncoder(params)
        for sdr in sdrs:
            sdr.zero()
        for document, sdr in zip(documents, sdrs):
            plainEncoder.encode(document, sdr)
        assert sdr1.getOverlap(sdr2) > sdr2.getOverlap(sdr3)
        assert sdr2.getOverlap(sdr3) > sdr3.getOverlap(sdr4)
        assert sdr3.getOverlap(sdr4) > sdr1.getOverlap(sdr3)
Esempio n. 5
0
    def encode(self, location, grid_cells=None):
        """
        Encode a 2-D position as an SDR of active grid cells.

        Argument location: the position as a pair of coordinates, such as
                           "[X, Y]".  If either coordinate is NaN the result
                           has no active bits.

        Argument grid_cells: Optional, the SDR object that receives the result.
                             Its dimensions must be "[GridCellEncoder.size]".

        Returns grid_cells, an SDR object.  A new one is created when the
        caller did not supply one.
        """
        location = list(location)
        assert len(location) == 2
        if grid_cells is not None:
            assert isinstance(grid_cells, SDR)
            assert grid_cells.dimensions == [self.size]
        else:
            grid_cells = SDR((self.size, ))
        if any(math.isnan(coordinate) for coordinate in location):
            grid_cells.zero()
            return grid_cells

        # Express the location relative to every cell's offset, then rotate
        # each module's slice with that module's rotation matrix.  The hex
        # radius used for a module is half of its period.
        relative = location - self.offsets_
        hex_radius = np.empty(self.size)
        for module, (first, last) in enumerate(self.partitions_):
            rotation = self.rot_mats_[module]
            relative[first:last] = rotation.dot(relative[first:last].T).T
            hex_radius[first:last] = self.periods[module] / 2

        # A round trip through hexagonal (cube) coordinates snaps each point
        # to the center of its nearest hexagon.
        as_cube = hexy.pixel_to_cube(relative, hex_radius)
        hex_center = hexy.cube_to_pixel(as_cube, hex_radius)

        # Euclidean distance from each point to the hexagon center it snapped
        # to.
        gap = np.hypot(*(hex_center - relative).T)

        # Within every module keep the cells with the smallest distances; the
        # number kept is the module size scaled by the sparsity.
        active = []
        for first, last in self.partitions_:
            keep = int(round(self.sparsity * (last - first)))
            winners = np.argpartition(gap[first:last], keep)[:keep]
            active.extend(winners + first)
        grid_cells.sparse = active
        return grid_cells
Esempio n. 6
0
 def testZero(self):
     """
     zero() must clear every active bit and hand back the very SDR it was
     called on.
     """
     sdr = SDR((103, ))
     sdr.sparse = list(range(20))
     returned = sdr.zero()
     assert returned is sdr
     assert np.sum(sdr.dense) == 0
Esempio n. 7
0
    def encode(self, inp, output=None):
        """
        Encode a point in time by running every enabled sub-encoder and
        combining their results into one SDR.

        Argument inp: (datetime) representing the time being encoded.  None or
                      a float NaN produces an output with no active bits.

        Argument output: Optional, the SDR object to store the results in.
                         Its dimensions must match this encoder's dimensions.

        Returns output, an SDR object.  This will be created if not given.

        Raises ValueError when inp is neither None, a float NaN, nor a
        datetime.
        """
        if output is not None:
            assert isinstance(output, SDR)
            assert all(
                have == want
                for have, want in zip(output.dimensions, self.dimensions))
        else:
            output = SDR(self.dimensions)

        # Missing input (None or NaN) encodes as all zeros.
        if inp is None or (isinstance(inp, float) and math.isnan(inp)):
            output.zero()
            return output
        if not isinstance(inp, datetime.datetime):
            raise ValueError("Input is type %s, expected datetime. Value: %s" %
                             (type(inp), str(inp)))

        # ---------------------------------------------------------------------
        # Run each enabled sub-encoder, collecting the resulting SDRs in order.
        pieces = []
        fields = inp.timetuple()
        hourOfDay = fields.tm_hour + float(fields.tm_min) / 60.0

        if self.seasonEncoder is not None:
            # tm_yday counts from 1; the season encoder is fed a 0-based day.
            zeroBasedDay = fields.tm_yday - 1
            assert zeroBasedDay >= 0
            pieces.append(self.seasonEncoder.encode(zeroBasedDay))

        if self.dayOfWeekEncoder is not None:
            # Weekday plus the elapsed fraction of the day, shifted back by
            # half a day so that each day is centered on noon rather than on
            # midnight.
            weekPosition = fields.tm_wday + hourOfDay / 24.0 - .5
            if weekPosition < 0:
                # Monday before noon wraps around to the end of the week.
                weekPosition += 7
            assert 0 <= weekPosition < 7
            pieces.append(self.dayOfWeekEncoder.encode(weekPosition))

        if self.weekendEncoder is not None:
            # Saturday (5), Sunday (6), or Friday (4) after 18:00.
            isWeekend = (fields.tm_wday in (5, 6)
                         or (fields.tm_wday == 4 and hourOfDay > 18))
            pieces.append(self.weekendEncoder.encode(1 if isWeekend else 0))

        if self.customDaysEncoder is not None:
            isCustomDay = fields.tm_wday in self.customDays
            pieces.append(self.customDaysEncoder.encode(1 if isCustomDay else 0))

        if self.holidayEncoder is not None:
            # Holiday signal: 0 away from any holiday, rising from 0 to 1
            # during the day before a holiday, exactly 1 on the holiday, and
            # 1 plus the elapsed fraction of the day on the day after.
            # NOTE(review): the day-after value runs from 1 towards 2;
            # presumably the holiday encoder is periodic so that 2 meets 0 --
            # confirm against the encoder's construction.
            holidayValue = 0
            for entry in self.holidays:
                # A 3-item entry carries its own year; a shorter entry is
                # (month, day) and is placed in the year of the input.
                if len(entry) == 3:
                    year, month, day = entry[0], entry[1], entry[2]
                else:
                    year, month, day = fields.tm_year, entry[0], entry[1]
                holidayStart = datetime.datetime(year, month, day, 0, 0, 0)

                if inp > holidayStart:
                    elapsed = inp - holidayStart
                    if elapsed.days == 0:
                        # On the holiday itself.
                        holidayValue = 1
                        break
                    if elapsed.days == 1:
                        # The day after the holiday.
                        holidayValue = 1.0 + (float(elapsed.seconds) / 86400)
                        break
                else:
                    remaining = holidayStart - inp
                    if remaining.days == 0:
                        # Less than a day before the holiday.  No break here,
                        # so a later entry may still replace this value.
                        holidayValue = 1.0 - (float(remaining.seconds) / 86400)

            pieces.append(self.holidayEncoder.encode(holidayValue))

        if self.timeOfDayEncoder is not None:
            pieces.append(self.timeOfDayEncoder.encode(hourOfDay))

        # A single piece is copied directly; several are joined together.
        if len(pieces) <= 1:
            output.setSDR(pieces[0])
        else:
            output.concatenate(pieces)
        return output