def setUp(self):
    """Build the shared fixture: two signed documents stored in one trie permutation.

    Sets ``dimension`` (50) and ``signatureLength`` (23), a TwoWayMap that
    maps every dimension index forward to itself, ``signatureLength`` random
    Gaussian unit vectors, two documents (ids 1 and 2) whose signatures are
    computed from those vectors, and ``self.pm``, a
    SignaturePermutationWithTrie to which both documents have been added.
    """
    self.dimension = 50
    self.signatureLength = 23

    # Identity mapping: index i is registered forward as i.
    dimensionMap = TwoWayMap()
    for index in range(self.dimension):
        dimensionMap.set(TwoWayMap.MAP_FORWARD, index, index)
    self.phraseTextAndDimensionMap = dimensionMap

    # One random unit vector per signature position.
    self.unitRandomVectors = []
    for _ in range(self.signatureLength):
        self.unitRandomVectors.append(
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))

    self.doc1 = Document(
        1,
        VectorGenerator.getRandomGaussianUnitVector(
            dimension=self.dimension, mu=0, sigma=1))
    self.doc2 = Document(
        2,
        VectorGenerator.getRandomGaussianUnitVector(
            dimension=self.dimension, mu=0, sigma=1))

    fixtureDocuments = (self.doc1, self.doc2)
    for document in fixtureDocuments:
        document.setSignatureUsingVectors(
            self.unitRandomVectors, self.phraseTextAndDimensionMap)

    self.pm = SignaturePermutationWithTrie(
        signatureLength=self.signatureLength)
    for document in fixtureDocuments:
        self.pm.addDocument(document)
def test_addDocument_newKey(self):
    # Adding a document to a freshly created permutation must create a trie
    # entry, keyed by the to01() form of the permuted signature, whose value
    # is the set containing only id 1.
    randomVector = VectorGenerator.getRandomGaussianUnitVector(
        dimension=self.dimension, mu=0, sigma=1)
    freshDocument = Document(1, randomVector)
    freshDocument.setSignatureUsingVectors(
        self.unitRandomVectors, self.phraseTextAndDimensionMap)

    freshPermutation = SignaturePermutationWithTrie(
        signatureLength=self.signatureLength)
    freshPermutation.addDocument(freshDocument)

    trie = freshPermutation.signatureTrie
    trieKey = freshDocument.signature.permutate(freshPermutation).to01()
    self.assertEqual(trie[trieKey], set([1]))
def __init__(self, **settings):
    """Configure the object from keyword settings.

    Keys read here:
        nearest_neighbor_threshold: stored as ``nearestNeighborThreshold``.
        dimensions: passed to RandomGaussianUnitVector and
            VectorPermutation.getPermutations.
        signature_length: passed to VectorPermutation.getPermutations and
            to every signature permutation.
        number_of_permutations: how many signature permutations to create.
        signature_type (optional): defaults to 'signature_type_trie', which
            selects SignaturePermutationWithTrie; any other value selects
            SignaturePermutationWithSortedList.

    Raises:
        KeyError: if a required key is missing.
    """
    self.settings = settings
    self.nearestNeighborThreshold = settings['nearest_neighbor_threshold']

    dimensions = settings['dimensions']
    self.unitVector = RandomGaussianUnitVector(
        dimensions=dimensions, mu=0, sigma=1)

    signatureLength = settings['signature_length']
    self.vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, self.unitVector)

    # Pick the permutation implementation once, then build all instances.
    requestedType = settings.get('signature_type', 'signature_type_trie')
    if requestedType == 'signature_type_trie':
        permutationClass = SignaturePermutationWithTrie
    else:
        permutationClass = SignaturePermutationWithSortedList
    self.signaturePermutations = [
        permutationClass(signatureLength)
        for _ in range(settings['number_of_permutations'])
    ]

    self.phraseTextAndDimensionMap = TwoWayMap()
    self.documentIdToDocumentMap = {}
def setUp(self):
    """Create the fixture shared by the tests.

    Produces an identity dimension map over 50 dimensions, 23 random
    Gaussian unit vectors, documents 1 and 2 with signatures derived from
    those vectors, and ``self.pm``, a trie-backed signature permutation
    holding both documents.
    """
    self.dimension, self.signatureLength = 50, 23

    def newRandomUnitVector():
        # Every vector in this fixture is drawn with the same parameters.
        return VectorGenerator.getRandomGaussianUnitVector(
            dimension=self.dimension, mu=0, sigma=1)

    self.phraseTextAndDimensionMap = TwoWayMap()
    for dimensionIndex in range(self.dimension):
        # Each index maps forward to itself.
        self.phraseTextAndDimensionMap.set(
            TwoWayMap.MAP_FORWARD, dimensionIndex, dimensionIndex)

    self.unitRandomVectors = [
        newRandomUnitVector() for _ in range(self.signatureLength)
    ]

    self.doc1 = Document(1, newRandomUnitVector())
    self.doc2 = Document(2, newRandomUnitVector())
    self.doc1.setSignatureUsingVectors(
        self.unitRandomVectors, self.phraseTextAndDimensionMap)
    self.doc2.setSignatureUsingVectors(
        self.unitRandomVectors, self.phraseTextAndDimensionMap)

    self.pm = SignaturePermutationWithTrie(
        signatureLength=self.signatureLength)
    self.pm.addDocument(self.doc1)
    self.pm.addDocument(self.doc2)
def __init__(self, **clustering_settings):
    """Configure the clustering object from keyword settings.

    Keys read here:
        threshold_for_document_to_be_in_cluster: stored as
            ``thresholdForDocumentToBeInACluster``.
        dimensions: passed to RandomGaussianUnitVector and
            VectorPermutation.getPermutations.
        signature_length: passed to VectorPermutation.getPermutations and
            to every signature permutation.
        number_of_permutations: how many signature permutations to create.
        signature_type (optional): defaults to 'signature_type_trie', which
            selects SignaturePermutationWithTrie; any other value selects
            SignaturePermutationWithSortedList.

    Raises:
        KeyError: if a required key is missing.
    """
    self.thresholdForDocumentToBeInACluster = clustering_settings[
        'threshold_for_document_to_be_in_cluster']

    dimensionCount = clustering_settings['dimensions']
    self.unitVector = RandomGaussianUnitVector(
        dimensions=dimensionCount, mu=0, sigma=1)

    signatureLength = clustering_settings['signature_length']
    self.vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensionCount, self.unitVector)

    signatureType = clustering_settings.get(
        'signature_type', 'signature_type_trie')
    # Only the selected class name is evaluated, as in the if/else form.
    permutationFactory = (
        SignaturePermutationWithTrie
        if signatureType == 'signature_type_trie'
        else SignaturePermutationWithSortedList)
    permutationCount = clustering_settings['number_of_permutations']
    self.signaturePermutations = [
        permutationFactory(signatureLength)
        for _ in range(permutationCount)
    ]

    self.phraseTextAndDimensionMap = TwoWayMap()
    self.clusters = {}
    self.clustering_settings = clustering_settings
def test_permutate(self):
    # Permuting a signature must leave the value reported by count()
    # unchanged. assertEqual is used instead of assertTrue(a == b) so a
    # failure reports both counts rather than just "False is not true".
    sgnt = Signature('1001011')
    originalCount = sgnt.count()
    permutedCount = sgnt.permutate(SignaturePermutationWithTrie(7)).count()
    self.assertEqual(originalCount, permutedCount)
def test_getNearestDocument_emptyTrie(self):
    # A permutation to which no document was added must return an empty set
    # when queried for the nearest documents.
    emptyPermutation = SignaturePermutationWithTrie(
        signatureLength=self.signatureLength)
    nearestDocuments = emptyPermutation.getNearestDocuments(self.doc1)
    self.assertEqual(nearestDocuments, set())
class SignaturePermutationTests(unittest.TestCase):
    """Tests for adding, querying and removing documents in SignaturePermutationWithTrie."""

    def setUp(self):
        # Fixture: identity dimension map, shared random unit vectors, and
        # two signed documents (ids 1 and 2) stored in self.pm.
        self.dimension, self.signatureLength = 50, 23
        self.phraseTextAndDimensionMap = TwoWayMap()
        for i in range(self.dimension):
            self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, i, i)
        self.unitRandomVectors = [
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1)
            for _ in range(self.signatureLength)
        ]
        self.doc1 = self._randomDocument(1)
        self.doc2 = self._randomDocument(2)
        self.doc1.setSignatureUsingVectors(self.unitRandomVectors,
                                           self.phraseTextAndDimensionMap)
        self.doc2.setSignatureUsingVectors(self.unitRandomVectors,
                                           self.phraseTextAndDimensionMap)
        self.pm = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.pm.addDocument(self.doc1)
        self.pm.addDocument(self.doc2)

    def _randomDocument(self, docId):
        # Build a Document with the given id and a fresh random Gaussian
        # unit vector of self.dimension dimensions.
        return Document(
            docId,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))

    def _trieKey(self, document, permutation):
        # Key under which the tests look the document up in
        # permutation.signatureTrie.
        return document.signature.permutate(permutation).to01()

    def test_addDocument_newKey(self):
        # A document added to an empty permutation gets its own trie entry.
        doc1 = self._randomDocument(1)
        doc1.setSignatureUsingVectors(self.unitRandomVectors,
                                      self.phraseTextAndDimensionMap)
        pm = SignaturePermutationWithTrie(signatureLength=self.signatureLength)
        pm.addDocument(doc1)
        self.assertEqual(pm.signatureTrie[self._trieKey(doc1, pm)], {1})

    def test_addDocument_existingKey(self):
        # A second document with doc1's signature joins doc1's trie entry.
        newDocModifiedWithExistingSignature = self._randomDocument(3)
        newDocModifiedWithExistingSignature.signature = Signature(
            self.doc1.signature.to01())
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)], {1, 3})

    def test_getNearestDocument_usingAKeyAlreadyInTrie(self):
        self.assertEqual(self.pm.getNearestDocuments(self.doc1), {1})

    def test_getNearestDocument_usingANearbyKeyInTrie(self):
        digitReplacement = {'0': '1', '1': '0'}
        newDocWithANearbySignature = self._randomDocument(3)
        exactSignature = self.doc1.signature.to01()
        # Same signature as doc1 with only the last digit flipped.
        newDocWithANearbySignature.signature = Signature(
            exactSignature[:-1] + digitReplacement[exactSignature[-1]])
        # assertNotEquals is a deprecated alias (removed in Python 3.12).
        self.assertNotEqual(self.doc1.signature.to01(),
                            newDocWithANearbySignature.signature.to01())
        # This assertion can sometimes fail because of randomization.
        # Run the tests again. It's OK!
        self.assertEqual(
            self.pm.getNearestDocuments(newDocWithANearbySignature), {1})

    def test_getNearestDocument_emptyTrie(self):
        permutationWithEmptyTrie = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.assertEqual(
            permutationWithEmptyTrie.getNearestDocuments(self.doc1), set())

    def test_removeDocument_documents(self):
        # Removing documents shrinks the trie entry and finally drops it.
        newDocModifiedWithExistingSignature = self._randomDocument(3)
        newDocModifiedWithExistingSignature.signature = Signature(
            self.doc1.signature.to01())
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)], {1, 3})
        self.pm.removeDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[self._trieKey(self.doc1, self.pm)], {1})
        self.pm.removeDocument(self.doc1)
        self.assertIsNone(
            self.pm.signatureTrie.get(self._trieKey(self.doc1, self.pm)))

    def test_resetSignatureTrie(self):
        # Specific assertions report the actual length on failure.
        self.assertGreater(len(self.pm.signatureTrie), 0)
        self.pm.resetSignatureDataStructure()
        self.assertEqual(len(self.pm.signatureTrie), 0)
class SignaturePermutationTests(unittest.TestCase):
    """Exercises SignaturePermutationWithTrie: add, nearest-lookup, remove and reset."""

    def setUp(self):
        # Fixture: identity dimension map, shared random unit vectors, and
        # two signed documents (ids 1 and 2) stored in self.pm.
        self.dimension, self.signatureLength = 50, 23
        self.phraseTextAndDimensionMap = TwoWayMap()
        for i in range(self.dimension):
            self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, i, i)
        self.unitRandomVectors = [
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1)
            for _ in range(self.signatureLength)
        ]
        self.doc1 = Document(
            1,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        self.doc2 = Document(
            2,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        self.doc1.setSignatureUsingVectors(
            self.unitRandomVectors, self.phraseTextAndDimensionMap)
        self.doc2.setSignatureUsingVectors(
            self.unitRandomVectors, self.phraseTextAndDimensionMap)
        self.pm = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.pm.addDocument(self.doc1)
        self.pm.addDocument(self.doc2)

    def test_addDocument_newKey(self):
        # A document added to an empty permutation gets its own trie entry.
        doc1 = Document(
            1,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        doc1.setSignatureUsingVectors(
            self.unitRandomVectors, self.phraseTextAndDimensionMap)
        pm = SignaturePermutationWithTrie(signatureLength=self.signatureLength)
        pm.addDocument(doc1)
        self.assertEqual(
            pm.signatureTrie[doc1.signature.permutate(pm).to01()], {1})

    def test_addDocument_existingKey(self):
        # A second document with doc1's signature joins doc1's trie entry.
        newDocModifiedWithExistingSignature = Document(
            3,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        newDocModifiedWithExistingSignature.signature = Signature(
            self.doc1.signature.to01())
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[
                self.doc1.signature.permutate(self.pm).to01()],
            {1, 3})

    def test_getNearestDocument_usingAKeyAlreadyInTrie(self):
        self.assertEqual(self.pm.getNearestDocuments(self.doc1), {1})

    def test_getNearestDocument_usingANearbyKeyInTrie(self):
        digitReplacement = {'0': '1', '1': '0'}
        newDocWithANearbySignature = Document(
            3,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        exactSignature = self.doc1.signature.to01()
        # Same signature as doc1 with only the last digit flipped.
        flippedLastDigit = digitReplacement[exactSignature[-1]]
        newDocWithANearbySignature.signature = Signature(
            exactSignature[:-1] + flippedLastDigit)
        # assertNotEquals is a deprecated alias (removed in Python 3.12).
        self.assertNotEqual(
            self.doc1.signature.to01(),
            newDocWithANearbySignature.signature.to01())
        # This assertion can sometimes fail because of randomization.
        # Run the tests again. It's OK!
        self.assertEqual(
            self.pm.getNearestDocuments(newDocWithANearbySignature), {1})

    def test_getNearestDocument_emptyTrie(self):
        permutationWithEmptyTrie = SignaturePermutationWithTrie(
            signatureLength=self.signatureLength)
        self.assertEqual(
            permutationWithEmptyTrie.getNearestDocuments(self.doc1), set())

    def test_removeDocument_documents(self):
        # Removing documents shrinks the trie entry and finally drops it.
        newDocModifiedWithExistingSignature = Document(
            3,
            VectorGenerator.getRandomGaussianUnitVector(
                dimension=self.dimension, mu=0, sigma=1))
        newDocModifiedWithExistingSignature.signature = Signature(
            self.doc1.signature.to01())
        self.pm.addDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[
                self.doc1.signature.permutate(self.pm).to01()],
            {1, 3})
        self.pm.removeDocument(newDocModifiedWithExistingSignature)
        self.assertEqual(
            self.pm.signatureTrie[
                self.doc1.signature.permutate(self.pm).to01()],
            {1})
        self.pm.removeDocument(self.doc1)
        self.assertIsNone(
            self.pm.signatureTrie.get(
                self.doc1.signature.permutate(self.pm).to01()))

    def test_resetSignatureTrie(self):
        # Specific assertions report the actual length on failure.
        self.assertGreater(len(self.pm.signatureTrie), 0)
        self.pm.resetSignatureDataStructure()
        self.assertEqual(len(self.pm.signatureTrie), 0)
def test_getNearestDocument_emptyTrie(self):
    # Querying a permutation that holds no documents must return an empty set.
    noNeighbours = set()
    permutationWithoutDocuments = SignaturePermutationWithTrie(
        signatureLength=self.signatureLength)
    self.assertEqual(
        permutationWithoutDocuments.getNearestDocuments(self.doc1),
        noNeighbours)
def test_addDocument_newKey(self):
    # After one document (id 1) is added to a new permutation, the trie
    # entry for its permuted signature must equal the set containing 1.
    soleDocument = Document(
        1,
        VectorGenerator.getRandomGaussianUnitVector(
            dimension=self.dimension, mu=0, sigma=1))
    soleDocument.setSignatureUsingVectors(
        self.unitRandomVectors, self.phraseTextAndDimensionMap)

    permutation = SignaturePermutationWithTrie(
        signatureLength=self.signatureLength)
    permutation.addDocument(soleDocument)

    storedIds = permutation.signatureTrie[
        soleDocument.signature.permutate(permutation).to01()]
    self.assertEqual(storedIds, set([1]))
def offlineLSHClusteringDemo():
    """Run the offline LSH clustering demo and return the purity of its predictions.

    Training documents are read from '../data/train_offline.dat'. The first
    whitespace-separated token of each line is used as the cluster id and
    the remaining tokens are counted into a term-frequency vector. Every
    cluster is represented by one Document built from the mean vector of
    its training documents, and that Document is added to each signature
    permutation.

    Test documents from '../data/test_offline.dat' are then classified: the
    candidate clusters are the union of getNearestDocuments() over all
    permutations, and the candidate with the highest cosine similarity to
    the test document is predicted.

    Returns:
        The value of EvaluationMetrics.purity(predicted, labels).

    Raises:
        ValueError: if no permutation returns a candidate cluster for some
            test document (max() over an empty sequence).
        IndexError: if an input line contains no tokens.
    """
    wordToDimensionMap = {}

    def createDocumentFromLine(docId, line):
        # Words are assigned dimensions in order of first appearance across
        # every line parsed through this closure.
        vector = Vector()
        words = line.split()
        for word in words[1:]:
            if word not in wordToDimensionMap:
                wordToDimensionMap[word] = len(wordToDimensionMap)
            wordDimension = wordToDimensionMap[word]
            if wordDimension not in vector:
                vector[wordDimension] = 1
            else:
                vector[wordDimension] += 1
        return Document(docId, vector, clusterId=words[0])

    dimensions = 53
    signatureLength = 13
    numberOfPermutations = 5

    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    signaturePermutations = [
        SignaturePermutationWithTrie(signatureLength)
        for _ in range(numberOfPermutations)
    ]
    permutatedUnitVectors = [
        unitVector.getPermutedVector(r) for r in vectorPermutations
    ]

    # Build LSH model.
    # Read training documents.
    trainingDocumentsMap = {}
    for docId, line in enumerate(
            FileIO.iterateLinesFromFile('../data/train_offline.dat')):
        trainingDocumentsMap[docId] = createDocumentFromLine(docId, line)

    # Construct cluster vectors: one mean-vector Document per cluster id.
    clusterToDocumentsMap = defaultdict(list)
    for document in trainingDocumentsMap.values():
        clusterToDocumentsMap[document.clusterId].append(document)
    clusterMap = {}
    # items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for clusterId, documents in clusterToDocumentsMap.items():
        clusterMap[clusterId] = Document(
            docId=clusterId,
            vector=Vector.getMeanVector(documents),
            clusterId=clusterId)

    # Create signatures and signature permutations for all the clusters.
    # An explicit loop is required: map() is lazy on Python 3, so using it
    # purely for side effects would never set any signature.
    # NOTE(review): other callers in this file pass a dimension map as a
    # second argument to setSignatureUsingVectors; confirm the one-argument
    # form is supported.
    for document in clusterMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)
    for permutation in signaturePermutations:
        for document in clusterMap.values():
            permutation.addDocument(document)

    # Testing the model.
    # Read testing documents.
    testDocumentsMap = {}
    for docId, line in enumerate(
            FileIO.iterateLinesFromFile('../data/test_offline.dat')):
        testDocumentsMap[docId] = createDocumentFromLine(docId, line)

    # Create signatures for test documents (explicit loop, see above).
    for document in testDocumentsMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)

    predicted, labels = [], []
    for testDocument in testDocumentsMap.values():
        # Union of the candidates from every permutation. set().union(*...)
        # replaces reduce(), which is not a builtin on Python 3.
        possibleNearestClusters = set().union(
            *(permutation.getNearestDocuments(testDocument)
              for permutation in signaturePermutations))
        # Raises ValueError when there are no candidates at all.
        predictedClass = max(
            ((clusterId, clusterMap[clusterId].cosineSimilarity(testDocument))
             for clusterId in possibleNearestClusters),
            key=itemgetter(1))
        predicted.append(predictedClass[0])
        labels.append(testDocument.clusterId)
    return EvaluationMetrics.purity(predicted, labels)