def testUnicode(self):
    """
    Encode two unicode documents that differ by one trailing character per
    token, and check their overlap against a threshold of 65: above it with
    'tokenSimilarity' on, below it with 'tokenSimilarity' off.
    """
    base_tokens = [
        "\u0395\u0396\u0397\u0398\u0399",
        "\u0400\u0401\u0402\u0403\u0404",
        "\u0405\u0406\u0407\u0408\u0409",
    ]
    # Second document: every token of the first with "\u0410" appended.
    extended_tokens = [token + "\u0410" for token in base_tokens]

    params = SimHashDocumentEncoderParameters()
    params.size = 400
    params.sparsity = 0.33

    # unicode 'tokenSimilarity' ON
    params.tokenSimilarity = True
    similarity_encoder = SimHashDocumentEncoder(params)
    sdr_base = SDR(params.size)
    sdr_extended = SDR(params.size)
    similarity_encoder.encode(base_tokens, sdr_base)
    similarity_encoder.encode(extended_tokens, sdr_extended)
    assert sdr_base.getOverlap(sdr_extended) > 65

    # unicode 'tokenSimilarity' OFF
    params.tokenSimilarity = False
    plain_encoder = SimHashDocumentEncoder(params)
    # Reuse the same two SDRs, cleared before the second round of encoding.
    for sdr in (sdr_base, sdr_extended):
        sdr.zero()
    plain_encoder.encode(base_tokens, sdr_base)
    plain_encoder.encode(extended_tokens, sdr_extended)
    assert sdr_base.getOverlap(sdr_extended) < 65
def testEncoding(self):
    """
    Check the basic encode() contract: output size and number of active bits,
    equality of the list and string calling styles, and an all-zero result
    for empty input.
    """
    params = SimHashDocumentEncoderParameters()
    params.size = 400
    params.activeBits = 20

    # main call style - list
    list_encoder = SimHashDocumentEncoder(params)
    from_list = list_encoder.encode(testDoc1)
    assert list_encoder.size == params.size
    assert from_list.size == params.size
    assert from_list.getSum() == params.activeBits

    # simple alternate calling style - string
    string_encoder = SimHashDocumentEncoder(params)
    from_string = string_encoder.encode("abcde fghij klmno pqrst uvwxy")
    assert from_list == from_string

    # encoding empty values leads to output of zeros
    all_zero = SDR(params.size)
    all_zero.zero()
    empty_results = [list_encoder.encode(empty) for empty in ([], "")]
    for result in empty_results:
        assert result == all_zero
def testStr(self):
    """
    Check the string form of an SDR: its dimensions, followed by the flat
    indices of the active bits (nothing after the dimensions when empty).
    """
    flat = SDR((103,))
    cube = SDR((100, 100, 1))

    for position in (0, 9, 102):
        flat.dense[position] = 1
    # Assign the dense array back to itself after the in-place writes
    # (presumably so the SDR registers the change -- confirm against SDR docs).
    flat.dense = flat.dense
    assert str(flat) == "SDR( 103 ) 0, 9, 102"

    flat.zero()
    assert str(flat) == "SDR( 103 )"

    for coordinates in ((0, 0, 0), (99, 99, 0)):
        cube.dense[coordinates] = 1
    cube.dense = cube.dense
    assert str(cube) == "SDR( 100, 100, 1 ) 0, 9999"
def testTokenSimilarity(self):
    """
    Encode the four module-level test documents with 'tokenSimilarity' on and
    then off, and check the expected ordering of pairwise overlaps in each mode.
    """
    params = SimHashDocumentEncoderParameters()
    params.size = 400
    params.sparsity = 0.33
    params.caseSensitivity = True

    documents = (testDoc1, testDoc2, testDoc3, testDoc4)

    # tokenSimilarity ON
    params.tokenSimilarity = True
    encoder_on = SimHashDocumentEncoder(params)
    sdrs = [SDR(params.size) for _ in documents]
    for document, sdr in zip(documents, sdrs):
        encoder_on.encode(document, sdr)
    sdr1, sdr2, sdr3, sdr4 = sdrs

    overlap_34 = sdr3.getOverlap(sdr4)
    overlap_23 = sdr2.getOverlap(sdr3)
    overlap_13 = sdr1.getOverlap(sdr3)
    overlap_14 = sdr1.getOverlap(sdr4)
    assert overlap_34 > overlap_23
    assert overlap_23 > overlap_13
    assert overlap_13 > overlap_14

    # tokenSimilarity OFF
    params.tokenSimilarity = False
    encoder_off = SimHashDocumentEncoder(params)
    # Reuse the same four SDRs, cleared before encoding again.
    for sdr in sdrs:
        sdr.zero()
    for document, sdr in zip(documents, sdrs):
        encoder_off.encode(document, sdr)

    overlap_12 = sdr1.getOverlap(sdr2)
    overlap_23 = sdr2.getOverlap(sdr3)
    overlap_34 = sdr3.getOverlap(sdr4)
    overlap_13 = sdr1.getOverlap(sdr3)
    assert overlap_12 > overlap_23
    assert overlap_23 > overlap_34
    assert overlap_34 > overlap_13
def encode(self, location, grid_cells=None):
    """
    Transform a 2-D coordinate into an SDR.

    Argument location: pair of coordinates, such as "[X, Y]"
    Argument grid_cells: Optional, the SDR object to store the results in.
                         Its dimensions must be "[GridCellEncoder.size]"

    Returns grid_cells, an SDR object.  This will be created if not given.
    If either coordinate is NaN the SDR is zeroed and returned as is.
    """
    location = list(location)
    assert (len(location) == 2)
    if grid_cells is not None:
        assert (isinstance(grid_cells, SDR))
        assert (grid_cells.dimensions == [self.size])
    else:
        grid_cells = SDR((self.size, ))
    if any(math.isnan(coordinate) for coordinate in location):
        grid_cells.zero()
        return grid_cells

    # Find the distance from the location to each grid cell's nearest
    # receptive field center.
    # Convert the units of location to hex grid with angle 0, scale 1, offset 0.
    displacement = location - self.offsets_
    radius = np.empty(self.size)
    for mod_idx, (start, stop) in enumerate(self.partitions_):
        module = slice(start, stop)
        rotation = self.rot_mats_[mod_idx]
        # Rotate this module's displacements with the module's own matrix.
        displacement[module] = rotation.dot(displacement[module].T).T
        radius[module] = self.periods[mod_idx] / 2

    # Convert into and out of hexagonal coordinates, which rounds to the
    # nearest hexagon's center.
    cube_coordinates = hexy.pixel_to_cube(displacement, radius)
    nearest = hexy.cube_to_pixel(cube_coordinates, radius)

    # Find the distance between the location and the RF center.
    offset_from_center = nearest - displacement
    distances = np.hypot(*offset_from_center.T)

    # Activate the closest grid cells in each module.
    index = []
    for start, stop in self.partitions_:
        # Number of cells to activate in this module.
        z = int(round(self.sparsity * (stop - start)))
        closest = np.argpartition(distances[start:stop], z)[:z]
        # argpartition indexes within the module's slice; shift by start.
        index.extend(closest + start)
    grid_cells.sparse = index
    return grid_cells
def testZero(self):
    """zero() clears every active bit and returns the SDR it was called on."""
    sdr = SDR((103, ))
    sdr.sparse = list(range(20))
    returned = sdr.zero()
    assert np.sum(sdr.dense) == 0
    assert returned is sdr
def encode(self, inp, output=None):
    """
    Encode a datetime into an SDR by running every enabled sub-encoder
    (season, day of week, weekend, custom days, holiday, time of day, in that
    order) and combining their outputs.

    Argument inp: (datetime) representing the time being encoded.  None or a
                  float NaN produces an all-zero output.
    Argument output: Optional, the SDR object to store the results in.  Its
                     dimensions must match this encoder's dimensions.

    Returns output, an SDR object.  This will be created if not given.

    Raises ValueError if inp is not None, not a float NaN and not a
    datetime.datetime.
    """
    if output is None:
        output = SDR(self.dimensions)
    else:
        assert (isinstance(output, SDR))
        assert (all(x == y for x, y in zip(output.dimensions, self.dimensions)))

    if inp is None or (isinstance(inp, float) and math.isnan(inp)):
        output.zero()
        return output
    elif not isinstance(inp, datetime.datetime):
        raise ValueError("Input is type %s, expected datetime. Value: %s" % (type(inp), str(inp)))

    # -------------------------------------------------------------------------
    # Encode each sub-field
    sdrs = []
    timetuple = inp.timetuple()
    # Hour of the day as a decimal value; seconds are ignored.
    timeOfDay = timetuple.tm_hour + float(timetuple.tm_min) / 60.0

    if self.seasonEncoder is not None:
        # Number the days starting at zero, instead of 1 like the datetime does.
        dayOfYear = timetuple.tm_yday - 1
        assert (dayOfYear >= 0)
        sdrs.append(self.seasonEncoder.encode(dayOfYear))

    if self.dayOfWeekEncoder is not None:
        # Add hours as a decimal value in extension to the day.
        hrs_ = float(timeOfDay) / 24.0
        dayOfWeek = timetuple.tm_wday + hrs_
        # Round towards noon, not midnight; this means similarity of
        # representations changes at midnights, not noon.
        dayOfWeek -= .5
        # Handle underflow: on Monday before noon -> move to Sunday.
        if dayOfWeek < 0:
            dayOfWeek += 7
        assert (dayOfWeek >= 0 and dayOfWeek < 7)
        sdrs.append(self.dayOfWeekEncoder.encode(dayOfWeek))

    if self.weekendEncoder is not None:
        # Saturday, Sunday or Friday evening (tm_wday counts Monday as 0).
        if (timetuple.tm_wday == 6 or timetuple.tm_wday == 5 or
                (timetuple.tm_wday == 4 and timeOfDay > 18)):
            weekend = 1
        else:
            weekend = 0
        sdrs.append(self.weekendEncoder.encode(weekend))

    if self.customDaysEncoder is not None:
        if timetuple.tm_wday in self.customDays:
            customDay = 1
        else:
            customDay = 0
        sdrs.append(self.customDaysEncoder.encode(customDay))

    if self.holidayEncoder is not None:
        # A "continuous" binary value: 1 on the holiday itself, with a smooth
        # ramp 0 -> 1 on the day before the holiday and 1 -> 0 on the day after.
        # Each entry of self.holidays is either (month, day), which recurs in
        # the year of inp, or (year, month, day) for one specific date.
        val = 0
        for h in self.holidays:
            # hdate is midnight on the holiday
            if len(h) == 3:
                hdate = datetime.datetime(h[0], h[1], h[2], 0, 0, 0)
            else:
                hdate = datetime.datetime(timetuple.tm_year, h[0], h[1], 0, 0, 0)
            if inp > hdate:
                diff = inp - hdate
                if diff.days == 0:
                    # return 1 on the holiday itself
                    val = 1
                    break
                elif diff.days == 1:
                    # Ramp smoothly from 1 -> 0 on the next day.  diff.seconds
                    # is the time elapsed since midnight ending the holiday,
                    # so it must be subtracted to make the value fall.
                    val = 1.0 - (float(diff.seconds) / 86400)
                    break
            else:
                diff = hdate - inp
                if diff.days == 0:
                    # ramp smoothly from 0 -> 1 on the previous day
                    val = 1.0 - (float(diff.seconds) / 86400)
        sdrs.append(self.holidayEncoder.encode(val))

    if self.timeOfDayEncoder is not None:
        sdrs.append(self.timeOfDayEncoder.encode(timeOfDay))

    # A single sub-encoder's result is copied directly; several are joined.
    if len(sdrs) > 1:
        output.concatenate(sdrs)
    else:
        output.setSDR(sdrs[0])
    return output