def testTokenVocabulary(self):
    """Check how tokens absent from the vocabulary influence the encoding.

    Two string documents are encoded: the first uses only vocabulary keys,
    the second repeats them and appends tokens that are not vocabulary keys.
    The test expects different outputs when ``encodeOrphans`` is True and
    identical outputs when it is False.
    """
    token_weights = {
        "a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6,
        "g": 1, "h": 2, "i": 3, "j": 4, "k": 5, "l": 6,
    }
    known_only = "a b c d e f"
    with_extras = "a b c d e f t u w x y z"

    settings = SimHashDocumentEncoderParameters()
    settings.size = 400
    settings.sparsity = 0.33
    settings.vocabulary = token_weights

    def encode_both(encode_orphans):
        # Build a fresh encoder for the requested orphan handling and
        # encode both documents with it, in a fixed order.
        settings.encodeOrphans = encode_orphans
        encoder = SimHashDocumentEncoder(settings)
        first = encoder.encode(known_only)
        second = encoder.encode(with_extras)
        return first, second

    # vocabulary +encodeOrphans: the extra tokens must change the output.
    kept_first, kept_second = encode_both(True)
    assert kept_first != kept_second

    # vocabulary -encodeOrphans: the extra tokens must not change the output.
    dropped_first, dropped_second = encode_both(False)
    assert dropped_first == dropped_second
def testSerializeString(self):
    """Round-trip an encoder through writeToString() / loadFromString().

    A second encoder is first built with a deliberately different size and
    sparsity, then overwritten from the serialized string. After loading,
    its size, parameters and its encoding of ``document`` must match those
    of the original encoder.
    """
    vocab = {
        "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4, "sound": 3,
    }
    document = [
        "hear", "any", "sound", "sound", "louder", "but", "walls",
    ]

    params = SimHashDocumentEncoderParameters()
    params.size = 400
    params.sparsity = 0.33
    params.encodeOrphans = True
    params.vocabulary = vocab
    enc1 = SimHashDocumentEncoder(params)

    serialized = enc1.writeToString()
    output1 = enc1.encode(document)

    # Build the target encoder with different settings so a successful load
    # is observable as a change in its size and parameters.
    params.size = 40
    params.sparsity = 0.1
    enc2 = SimHashDocumentEncoder(params)

    assert enc1.size != enc2.size
    assert enc1.parameters.size != enc2.parameters.size
    assert enc1.parameters.activeBits != enc2.parameters.activeBits

    enc2.loadFromString(serialized)
    # Encode with the restored encoder. Encoding with enc1 again would make
    # the final comparison trivially true and leave enc2 unverified.
    output2 = enc2.encode(document)

    assert enc1.size == enc2.size
    assert enc1.parameters.size == enc2.parameters.size
    assert enc1.parameters.activeBits == enc2.parameters.activeBits
    assert output1 == output2
def testSerializeToFile(self):
    """Round-trip an encoder through saveToFile() / loadFromFile() as JSON.

    The encoder is saved to a file in the working directory, loaded into a
    second encoder that was constructed with different settings, and the two
    are compared on size, parameters and their encoding of ``document``.
    The file is removed even if a step after saving raises.
    """
    vocab = {
        "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4, "sound": 3,
    }
    document = [
        "hear", "any", "sound", "sound", "louder", "but", "walls",
    ]

    params = SimHashDocumentEncoderParameters()
    params.size = 400
    params.sparsity = 0.33
    params.encodeOrphans = True
    params.vocabulary = vocab
    enc1 = SimHashDocumentEncoder(params)

    # The SimHashDocumentEncoder now has some data in it, try serialization.
    path = "SimHashDocumentEncoder_test_save2.json"
    enc1.saveToFile(path, "JSON")
    try:
        output1 = enc1.encode(document)

        # Change the parameters so we know they were replaced by the
        # contents of the file.
        # Note: we should have a constructor without parameters for this
        # situation.
        params.size = 10
        params.sparsity = 0.5
        enc2 = SimHashDocumentEncoder(params)
        enc2.loadFromFile(path, "JSON")
    finally:
        # Always clean up the file written above, including on failure.
        os.remove(path)

    output2 = enc2.encode(document)

    assert enc1.size == enc2.size
    assert enc1.parameters.size == enc2.parameters.size
    assert enc1.parameters.activeBits == enc2.parameters.activeBits
    # The restored encoder must produce the same encoding as the original,
    # matching the checks in the string and pickle serialization tests.
    assert output1 == output2
def testTokenCaseSensitivity(self):
    """Exercise the ``caseSensitivity`` parameter of the encoder.

    Four scenarios are checked in order:
      1. caseSensitivity on: lower- and upper-case documents differ.
      2. caseSensitivity off: the same two documents encode identically.
      3. caseSensitivity off with mixed-case ``excludes``: the full
         lower-case document encodes like the mixed-case remainder.
      4. caseSensitivity off with a mixed-case ``vocabulary`` and
         ``encodeOrphans`` off: same expectation as scenario 3.
    """
    # Case-sensitive strings
    lower_doc = ["alpha", "bravo", "delta", "echo", "foxtrot", "hotel"]
    upper_doc = ["ALPHA", "BRAVO", "DELTA", "ECHO", "FOXTROT", "HOTEL"]
    mixed_rest = ["eCHo", "foXTROt", "hOtEl"]
    mixed_excludes = ["AlPHa", "BRaVo", "dELTa"]
    mixed_vocab = {"EcHo": 1, "FOxtRoT": 1, "HoTeL": 1}

    shared = SimHashDocumentEncoderParameters()
    shared.size = 400
    shared.sparsity = 0.33

    # caseSensitivity ON
    shared.caseSensitivity = True
    sensitive = SimHashDocumentEncoder(shared)
    lower_out = sensitive.encode(lower_doc)
    upper_out = sensitive.encode(upper_doc)
    assert lower_out != upper_out

    # caseSensitivity OFF
    shared.caseSensitivity = False
    insensitive = SimHashDocumentEncoder(shared)
    lower_out = insensitive.encode(lower_doc)
    upper_out = insensitive.encode(upper_doc)
    assert lower_out == upper_out

    # caseSensitivity=OFF +excludes
    shared.excludes = mixed_excludes
    excluding = SimHashDocumentEncoder(shared)
    full_out = excluding.encode(lower_doc)
    rest_out = excluding.encode(mixed_rest)
    assert full_out == rest_out

    # caseSensitivity=OFF +vocabulary
    vocab_params = SimHashDocumentEncoderParameters()
    vocab_params.size = 400
    vocab_params.sparsity = 0.33
    vocab_params.caseSensitivity = False
    vocab_params.encodeOrphans = False
    vocab_params.vocabulary = mixed_vocab
    vocab_bound = SimHashDocumentEncoder(vocab_params)
    full_out = vocab_bound.encode(lower_doc)
    rest_out = vocab_bound.encode(mixed_rest)
    assert full_out == rest_out
def testTokenWeightMap(self):
    """Check that vocabulary weights order the overlaps between documents.

    Tokens shared between the documents, with their weights:
      * doc A and doc C: "aaa" and "ddd" (weight 4 each)
      * doc A and doc B: "bbb", "ccc" (weight 2 each) and "sss" (weight 1)
      * doc B and doc C: "eee" and "fff" (weight 2 each)
    The test expects overlap(A, C) > overlap(A, B) > overlap(B, C).
    """
    token_weights = {
        "aaa": 4, "bbb": 2, "ccc": 2, "ddd": 4,
        "eee": 2, "fff": 2, "sss": 1,
    }
    doc_a = ["aaa", "bbb", "ccc", "ddd", "sss"]
    doc_b = ["eee", "bbb", "ccc", "fff", "sss"]
    doc_c = ["aaa", "eee", "fff", "ddd"]

    settings = SimHashDocumentEncoderParameters()
    settings.size = 400
    settings.sparsity = 0.33
    settings.tokenSimilarity = False
    settings.encodeOrphans = False
    settings.vocabulary = token_weights
    weighted = SimHashDocumentEncoder(settings)

    sdr_a = weighted.encode(doc_a)
    sdr_b = weighted.encode(doc_b)
    sdr_c = weighted.encode(doc_c)

    overlap_a_c = sdr_a.getOverlap(sdr_c)
    overlap_a_b = sdr_a.getOverlap(sdr_b)
    assert overlap_a_c > overlap_a_b

    overlap_b_c = sdr_b.getOverlap(sdr_c)
    assert overlap_a_b > overlap_b_c
def testSerializePickle(self):
    """Round-trip an encoder through pickle.dumps() / pickle.loads().

    The unpickled encoder must report the same size and parameters as the
    original and must produce an equal encoding of the same document.
    """
    token_weights = {
        "hear": 2, "nothing": 4, "but": 1, "a": 1, "rushing": 4, "sound": 3,
    }
    tokens = [
        "hear", "any", "sound", "sound", "louder", "but", "walls",
    ]

    settings = SimHashDocumentEncoderParameters()
    settings.size = 400
    settings.sparsity = 0.33
    settings.encodeOrphans = True
    settings.vocabulary = token_weights
    original = SimHashDocumentEncoder(settings)

    # Pickle before encoding, then encode with the original.
    blob = pickle.dumps(original)
    expected = original.encode(tokens)

    restored = pickle.loads(blob)
    actual = restored.encode(tokens)

    assert original.size == restored.size
    assert original.parameters.size == restored.parameters.size
    assert original.parameters.activeBits == restored.parameters.activeBits
    assert expected == actual