Esempio n. 1
0
    def testResolution(self):
        """Check ScalarEncoder overlaps when the encoder is sized by resolution.

        The encoder is configured with 10 active bits over [0, 100] and a
        resolution of 0.5.  The test expects 0.0 and 0.1 to produce equal
        SDRs, and expects the overlap between two encodings to drop from
        9 to 5 to 0 as the two inputs are moved further apart.
        """
        params = ScalarEncoderParameters()
        params.activeBits = 10
        params.minimum = 0
        params.maximum = 100
        params.resolution = .5
        encoder = ScalarEncoder(params)
        first = SDR(encoder.parameters.size)
        second = SDR(encoder.parameters.size)

        # Two inputs 0.1 apart are expected to give the very same encoding.
        encoder.encode(.0, first)
        encoder.encode(.1, second)
        assert first == second

        # (input for first SDR, input for second SDR, expected shared bits)
        expectations = (
            (.0, .6, 9),
            (70, 72.5, 5),
            (70, 75, 0),
            (60, 80, 0),
        )
        for left, right, shared_bits in expectations:
            encoder.encode(left, first)
            encoder.encode(right, second)
            assert first.getOverlap(second) == shared_bits
Esempio n. 2
0
    def testRadius(self):
        """Check ScalarEncoder overlaps when the encoder is sized by radius.

        The encoder is configured with 10 active bits over [0, 100] and a
        radius of 10.  Identical inputs are expected to share all 10 bits,
        and the expected overlap falls to 9, 1 and finally 0 as the inputs
        are moved 1, 9 and 10 units apart.
        """
        params = ScalarEncoderParameters()
        params.activeBits = 10
        params.minimum = 0
        params.maximum = 100
        params.radius = 10
        encoder = ScalarEncoder(params)
        first = SDR(encoder.parameters.size)
        second = SDR(encoder.parameters.size)

        # (input for first SDR, input for second SDR, expected shared bits)
        expectations = (
            (77, 77, 10),
            (0, 1, 9),
            (60, 69, 1),
            (45, 55, 0),
        )
        for left, right, shared_bits in expectations:
            encoder.encode(left, first)
            encoder.encode(right, second)
            assert first.getOverlap(second) == shared_bits
Esempio n. 3
0
    def testUnicode(self):
        """Check SimHashDocumentEncoder on non-ASCII tokens.

        Two documents are encoded whose tokens differ only by a trailing
        U+0410 character.  With ``tokenSimilarity`` enabled the encodings
        are expected to overlap in more than 65 bits; with it disabled the
        overlap is expected to be below 65 bits.
        """
        plain_doc = [
          "\u0395\u0396\u0397\u0398\u0399",
          "\u0400\u0401\u0402\u0403\u0404",
          "\u0405\u0406\u0407\u0408\u0409"]
        # Same tokens as above, each with U+0410 appended.
        suffixed_doc = [
          "\u0395\u0396\u0397\u0398\u0399\u0410",
          "\u0400\u0401\u0402\u0403\u0404\u0410",
          "\u0405\u0406\u0407\u0408\u0409\u0410"]

        settings = SimHashDocumentEncoderParameters()
        settings.size = 400
        settings.sparsity = 0.33

        # Token similarity enabled: near-identical tokens should look alike.
        settings.tokenSimilarity = True
        similarity_encoder = SimHashDocumentEncoder(settings)
        plain_sdr = SDR(settings.size)
        suffixed_sdr = SDR(settings.size)
        similarity_encoder.encode(plain_doc, plain_sdr)
        similarity_encoder.encode(suffixed_doc, suffixed_sdr)
        shared_bits = plain_sdr.getOverlap(suffixed_sdr)
        assert shared_bits > 65

        # Token similarity disabled: the same pair should overlap less.
        settings.tokenSimilarity = False
        exact_encoder = SimHashDocumentEncoder(settings)
        plain_sdr.zero()
        suffixed_sdr.zero()
        exact_encoder.encode(plain_doc, plain_sdr)
        exact_encoder.encode(suffixed_doc, suffixed_sdr)
        shared_bits = plain_sdr.getOverlap(suffixed_sdr)
        assert shared_bits < 65
Esempio n. 4
0
    def testGetOverlap(self):
        """Check SDR.getOverlap on matching and mismatching dimensions.

        Two 103-bit SDRs are filled through their ``dense`` arrays and the
        number of shared active bits is compared with the expected count
        after each change.  Finally, asking for the overlap with an SDR of
        different dimensions is expected to raise ``RuntimeError``.
        """
        A = SDR((103, ))
        B = SDR((103, ))
        self.assertEqual(A.getOverlap(B), 0)

        # NOTE(review): the `X.dense = X.dense` self-assignments below look
        # like they notify the SDR that its dense array was edited in place;
        # confirm against the SDR documentation before removing them.
        A.dense[:10] = 1
        B.dense[:20] = 1
        A.dense = A.dense
        B.dense = B.dense
        self.assertEqual(A.getOverlap(B), 10)

        A.dense[:20] = 1
        A.dense = A.dense
        self.assertEqual(A.getOverlap(B), 20)

        # Bits 50..59 are only in A and bit 0 is removed from B, leaving
        # bits 1..19 in common.
        A.dense[50:60] = 1
        B.dense[0] = 0
        A.dense = A.dense
        B.dense = B.dense
        self.assertEqual(A.getOverlap(B), 19)

        # Test wrong dimensions
        C = SDR((1, 1, 1, 1, 103))
        C.randomize(.5)
        with self.assertRaises(RuntimeError):
            A.getOverlap(C)
Esempio n. 5
0
 def testAddNoise(self):
     """Check SDR.addNoise for overlap, seeding behaviour and return value.

     After copying a 10%-sparse 103-bit SDR and adding 50% noise to the
     original, the two are expected to share 5 bits.  Unseeded noise on
     identical SDRs is expected to diverge, seeded noise is expected to
     match, and ``addNoise`` is expected to return the SDR it was called on.
     """
     noisy = SDR((103, ))
     reference = SDR((103, ))

     def reset_both():
         # Put both SDRs back into the same seeded random state.
         noisy.randomize(.3, 42)
         reference.randomize(.3, 42)

     noisy.randomize(.1)
     reference.setSDR(noisy)
     noisy.addNoise(.5)
     assert noisy.getOverlap(reference) == 5

     # Check different seed makes different results.
     reset_both()
     noisy.addNoise(.5)
     reference.addNoise(.5)
     assert noisy != reference

     # Check same seed makes same results.
     reset_both()
     noisy.addNoise(.5, 42)
     reference.addNoise(.5, 42)
     assert noisy == reference

     # Check that it returns itself.
     returned = noisy.addNoise(.5)
     assert returned is noisy
Esempio n. 6
0
    def testTokenSimilarity(self):
        """Check how ``tokenSimilarity`` changes document overlap ordering.

        Encodes the module-level documents testDoc1..testDoc4 (defined
        outside this method) with a case-sensitive SimHashDocumentEncoder,
        once with ``tokenSimilarity`` on and once with it off, and asserts
        the expected ranking of pairwise overlaps in each mode.
        """
        settings = SimHashDocumentEncoderParameters()
        settings.size = 400
        settings.sparsity = 0.33
        settings.caseSensitivity = True

        documents = (testDoc1, testDoc2, testDoc3, testDoc4)

        # tokenSimilarity ON
        settings.tokenSimilarity = True
        similarity_encoder = SimHashDocumentEncoder(settings)
        encodings = [SDR(settings.size) for _ in documents]
        for document, encoding in zip(documents, encodings):
            similarity_encoder.encode(document, encoding)
        sdr1, sdr2, sdr3, sdr4 = encodings
        # Expected ranking: (3,4) > (2,3) > (1,3) > (1,4)
        assert sdr3.getOverlap(sdr4) > sdr2.getOverlap(sdr3)
        assert sdr2.getOverlap(sdr3) > sdr1.getOverlap(sdr3)
        assert sdr1.getOverlap(sdr3) > sdr1.getOverlap(sdr4)

        # tokenSimilarity OFF
        settings.tokenSimilarity = False
        exact_encoder = SimHashDocumentEncoder(settings)
        for encoding in encodings:
            encoding.zero()
        for document, encoding in zip(documents, encodings):
            exact_encoder.encode(document, encoding)
        # Expected ranking: (1,2) > (2,3) > (3,4) > (1,3)
        assert sdr1.getOverlap(sdr2) > sdr2.getOverlap(sdr3)
        assert sdr2.getOverlap(sdr3) > sdr3.getOverlap(sdr4)
        assert sdr3.getOverlap(sdr4) > sdr1.getOverlap(sdr3)