Exemple #1
0
    def testTokenVocabulary(self):
        """Check how encodeOrphans interacts with a fixed vocabulary.

        Two documents are encoded: one made only of vocabulary tokens, and
        one that appends tokens missing from the vocabulary. The encodings
        must differ when encodeOrphans is on and match when it is off.
        """
        known_only = "a b c d e f"
        known_plus_orphans = "a b c d e f t u w x y z"
        token_weights = {
            "a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6,
            "g": 1, "h": 2, "i": 3, "j": 4, "k": 5, "l": 6,
        }

        params = SimHashDocumentEncoderParameters()
        params.vocabulary = token_weights
        params.sparsity = 0.33
        params.size = 400

        # Orphans encoded: the extra tokens must change the output.
        params.encodeOrphans = True
        with_orphans = SimHashDocumentEncoder(params)
        sdr_known = with_orphans.encode(known_only)
        sdr_extra = with_orphans.encode(known_plus_orphans)
        assert sdr_known != sdr_extra

        # Orphans dropped: the extra tokens must not change the output.
        params.encodeOrphans = False
        without_orphans = SimHashDocumentEncoder(params)
        sdr_known = without_orphans.encode(known_only)
        sdr_extra = without_orphans.encode(known_plus_orphans)
        assert sdr_known == sdr_extra
Exemple #2
0
    def testSerializeString(self):
        """Round-trip an encoder through its string serialization.

        A second encoder is built with deliberately different parameters,
        then overwritten via loadFromString(). Afterwards it must report the
        same size/parameters as the original and produce the same encoding
        for the same document.
        """
        vocab = {
            "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4,
            "sound": 3}
        document = [
            "hear", "any", "sound", "sound", "louder", "but", "walls"]

        params = SimHashDocumentEncoderParameters()
        params.size = 400
        params.sparsity = 0.33
        params.encodeOrphans = True
        params.vocabulary = vocab

        enc1 = SimHashDocumentEncoder(params)
        serialized = enc1.writeToString()
        output1 = enc1.encode(document)

        # Build the target encoder with different settings so that a
        # successful load is observable.
        params.size = 40
        params.sparsity = 0.1
        enc2 = SimHashDocumentEncoder(params)

        assert(enc1.size != enc2.size)
        assert(enc1.parameters.size != enc2.parameters.size)
        assert(enc1.parameters.activeBits != enc2.parameters.activeBits)

        enc2.loadFromString(serialized)
        # Encode with the *restored* encoder; encoding with enc1 again would
        # make the final comparison trivially true.
        output2 = enc2.encode(document)

        assert(enc1.size == enc2.size)
        assert(enc1.parameters.size == enc2.parameters.size)
        assert(enc1.parameters.activeBits == enc2.parameters.activeBits)
        assert(output1 == output2)
    def testSerializeToFile(self):
        """Round-trip an encoder through a JSON file on disk.

        The encoder is saved with saveToFile(), a second encoder with
        different parameters is overwritten via loadFromFile(), and the two
        must then agree on size, parameters and the encoding of the same
        document. The temporary file is removed even if a step fails.
        """
        vocab = {
            "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4,
            "sound": 3}
        document = [
            "hear", "any", "sound", "sound", "louder", "but", "walls"]

        params = SimHashDocumentEncoderParameters()
        params.size = 400
        params.sparsity = 0.33
        params.encodeOrphans = True
        params.vocabulary = vocab

        enc1 = SimHashDocumentEncoder(params)
        # The SimHashDocumentEncoder now has some data in it, try serialization.
        path = "SimHashDocumentEncoder_test_save2.json"
        try:
            enc1.saveToFile(path, "JSON")
            output1 = enc1.encode(document)

            # Change the parameters so we know the params were replaced from
            # the contents of the file.
            # Note: we should have a constructor without parameters for this
            # situation.
            params.size = 10
            params.sparsity = 0.5
            enc2 = SimHashDocumentEncoder(params)
            enc2.loadFromFile(path, "JSON")
        finally:
            # Always clean up, but do not mask an earlier failure if the
            # file was never created.
            if os.path.exists(path):
                os.remove(path)

        output2 = enc2.encode(document)
        assert(enc1.size == enc2.size)
        assert(enc1.parameters.size == enc2.parameters.size)
        assert(enc1.parameters.activeBits == enc2.parameters.activeBits)
        # The restored encoder must also encode identically.
        assert(output1 == output2)
Exemple #4
0
    def testTokenCaseSensitivity(self):
        """Check the caseSensitivity option alone, with excludes, and with a vocabulary.

        Expectations asserted below:
          * caseSensitivity on: lower- and upper-case documents encode differently.
          * caseSensitivity off: they encode identically.
          * caseSensitivity off + excludes: mixed-case excludes remove the
            matching lower-case tokens.
          * caseSensitivity off + vocabulary (orphans dropped): a mixed-case
            vocabulary selects the matching lower-case tokens.
        """
        # Case-sensitive strings
        lower_doc = [
            "alpha", "bravo",  "delta",  "echo",  "foxtrot", "hotel"]
        upper_doc = [
            "ALPHA", "BRAVO",  "DELTA",  "ECHO",  "FOXTROT", "HOTEL"]
        kept_tokens = ["eCHo", "foXTROt", "hOtEl"]
        excluded_tokens = ["AlPHa", "BRaVo", "dELTa"]
        mixed_case_vocab = {"EcHo": 1, "FOxtRoT": 1, "HoTeL": 1}

        base = SimHashDocumentEncoderParameters()
        base.sparsity = 0.33
        base.size = 400

        # caseSensitivity ON
        base.caseSensitivity = True
        sensitive = SimHashDocumentEncoder(base)
        sdr_lower = sensitive.encode(lower_doc)
        sdr_upper = sensitive.encode(upper_doc)
        assert sdr_lower != sdr_upper

        # caseSensitivity OFF
        base.caseSensitivity = False
        insensitive = SimHashDocumentEncoder(base)
        sdr_lower = insensitive.encode(lower_doc)
        sdr_upper = insensitive.encode(upper_doc)
        assert sdr_lower == sdr_upper

        # caseSensitivity=OFF +excludes
        base.excludes = excluded_tokens
        excluding = SimHashDocumentEncoder(base)
        sdr_full = excluding.encode(lower_doc)
        sdr_kept = excluding.encode(kept_tokens)
        assert sdr_full == sdr_kept

        # caseSensitivity=OFF +vocabulary (fresh parameters, no excludes)
        vocab_params = SimHashDocumentEncoderParameters()
        vocab_params.sparsity = 0.33
        vocab_params.size = 400
        vocab_params.encodeOrphans = False
        vocab_params.caseSensitivity = False
        vocab_params.vocabulary = mixed_case_vocab
        vocab_bound = SimHashDocumentEncoder(vocab_params)
        sdr_full = vocab_bound.encode(lower_doc)
        sdr_kept = vocab_bound.encode(kept_tokens)
        assert sdr_full == sdr_kept
Exemple #5
0
    def testTokenWeightMap(self):
        """Check that vocabulary weights influence the overlap between encodings.

        Three documents sharing different subsets of weighted tokens are
        encoded, and the pairwise overlaps are asserted to be ordered:
        (doc1, doc3) > (doc1, doc2) > (doc2, doc3).
        """
        token_weights = {
            "aaa": 4, "bbb": 2, "ccc": 2, "ddd": 4, "eee": 2, "fff": 2,
            "sss": 1,
        }
        first_doc = ["aaa", "bbb", "ccc", "ddd", "sss"]
        second_doc = ["eee", "bbb", "ccc", "fff", "sss"]
        third_doc = ["aaa", "eee", "fff", "ddd"]

        params = SimHashDocumentEncoderParameters()
        params.vocabulary = token_weights
        params.encodeOrphans = False
        params.tokenSimilarity = False
        params.sparsity = 0.33
        params.size = 400
        encoder = SimHashDocumentEncoder(params)

        sdr_first = encoder.encode(first_doc)
        sdr_second = encoder.encode(second_doc)
        sdr_third = encoder.encode(third_doc)

        overlap_first_third = sdr_first.getOverlap(sdr_third)
        overlap_first_second = sdr_first.getOverlap(sdr_second)
        assert overlap_first_third > overlap_first_second

        overlap_second_third = sdr_second.getOverlap(sdr_third)
        assert overlap_first_second > overlap_second_third
Exemple #6
0
    def testSerializePickle(self):
        """Round-trip an encoder through pickle and compare it with the original.

        The unpickled encoder must report the same size and parameters and
        produce the same encoding for the same document.
        """
        tokens = [
            "hear", "any", "sound", "sound", "louder", "but", "walls"]
        token_weights = {
            "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4,
            "sound": 3,
        }

        params = SimHashDocumentEncoderParameters()
        params.vocabulary = token_weights
        params.encodeOrphans = True
        params.sparsity = 0.33
        params.size = 400

        original = SimHashDocumentEncoder(params)
        # Pickle before encoding, as the original test does.
        blob = pickle.dumps(original)
        expected = original.encode(tokens)

        restored = pickle.loads(blob)
        actual = restored.encode(tokens)

        assert original.size == restored.size
        assert original.parameters.size == restored.parameters.size
        assert original.parameters.activeBits == restored.parameters.activeBits
        assert expected == actual