class RandomGaussianUnitVectorTests(unittest.TestCase):
    """Tests for RandomGaussianUnitVector used together with VectorPermutation."""

    def setUp(self):
        # Every test gets a fresh 5-dimensional vector and permutation.
        self.vector = RandomGaussianUnitVector(dimensions=5, mu=0, sigma=1)
        self.permutation = VectorPermutation(dimensions=5)

    def test_initialization(self):
        # The magnitude is formatted with zero decimals, so this is a coarse
        # check that mod() rounds to 1. assertEqual is used instead of the
        # deprecated assertEquals alias.
        self.assertEqual('%0.0f' % self.vector.mod(), '1')

    def test_getPermutedDimensionValue(self):
        # Indexing the vector with the permuted dimension must give the same
        # value as the dedicated accessor.
        self.assertEqual(
            self.vector[self.permutation.applyFunction(10)],
            self.vector.getPermutedDimensionValue(self.permutation, 10))

    def test_getPermutedVector(self):
        permutedVector = self.vector.getPermutedVector(self.permutation)
        # Exact type match is required, not just an instance of a subclass.
        self.assertEqual(RandomGaussianUnitVector, type(permutedVector))
        self.assertNotEqual(self.vector, permutedVector)
        # Same coarse magnitude check as in test_initialization.
        self.assertEqual('1', '%0.0f' % permutedVector.mod())

    def test_isPermutationSameAsVector(self):
        # NOTE(review): a=1, b=0 presumably turns the permutation into the
        # identity mapping -- confirm against VectorPermutation.
        self.permutation.a = 1
        self.permutation.b = 0
        self.assertTrue(
            self.vector.isPermutationSameAsVector(self.permutation))
def __init__(self, **settings):
    """Store the settings and build the structures derived from them.

    Keys read from ``settings``: ``nearest_neighbor_threshold``,
    ``dimensions``, ``signature_length``, ``number_of_permutations`` and the
    optional ``signature_type`` (defaults to ``'signature_type_trie'``).
    Any ``signature_type`` other than ``'signature_type_trie'`` selects
    SignaturePermutationWithSortedList.
    """
    self.settings = settings
    self.nearestNeighborThreshold = settings['nearest_neighbor_threshold']

    self.unitVector = RandomGaussianUnitVector(
        dimensions=settings['dimensions'], mu=0, sigma=1)
    signatureLength = settings['signature_length']
    self.vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, settings['dimensions'], self.unitVector)

    # Pick the signature-permutation implementation once, then build one
    # instance per requested permutation.
    signatureKind = settings.get('signature_type', 'signature_type_trie')
    if signatureKind == 'signature_type_trie':
        permutationClass = SignaturePermutationWithTrie
    else:
        permutationClass = SignaturePermutationWithSortedList
    self.signaturePermutations = [
        permutationClass(signatureLength)
        for _ in range(settings['number_of_permutations'])
    ]

    self.phraseTextAndDimensionMap = TwoWayMap()
    self.documentIdToDocumentMap = {}
def test_setSignatureUsingVectorPermutations(self):
    """Both ways of computing a document signature must agree.

    The signature is computed once from explicitly permuted unit vectors and
    once from the unit vector plus its permutations. The comparison is made
    first with a map covering every dimension, then with a map covering only
    the first ``dimensions - 50`` dimensions.
    """
    dimensions, signatureLength = 53, 13

    def buildIdentityMap(size):
        # Forward-map every dimension in range(size) to itself.
        identityMap = TwoWayMap()
        for dimension in range(size):
            identityMap.set(TwoWayMap.MAP_FORWARD, dimension, dimension)
        return identityMap

    completeMap = buildIdentityMap(dimensions)
    partialMap = buildIdentityMap(dimensions - 50)

    unitVector = RandomGaussianUnitVector(
        dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    permutedUnitVectors = [
        unitVector.getPermutedVector(permutation)
        for permutation in vectorPermutations
    ]

    # Both documents share the same vector so their signatures are comparable.
    documentVector = VectorGenerator.getRandomGaussianUnitVector(
        dimension=dimensions, mu=0, sigma=1)
    byVectors = Document(1, documentVector)
    byPermutations = Document(2, documentVector)

    for dimensionMap in (completeMap, partialMap):
        byVectors.setSignatureUsingVectors(permutedUnitVectors, dimensionMap)
        byPermutations.setSignatureUsingVectorPermutations(
            unitVector, vectorPermutations, dimensionMap)
        self.assertEqual(byVectors.signature, byPermutations.signature)
class RandomGaussianUnitVectorTests(unittest.TestCase):
    """Checks RandomGaussianUnitVector behaviour under a VectorPermutation."""

    def setUp(self):
        # A new 5-dimensional vector and permutation are built for each test.
        self.vector = RandomGaussianUnitVector(dimensions=5, mu=0, sigma=1)
        self.permutation = VectorPermutation(dimensions=5)

    def test_initialization(self):
        # Coarse check: mod() formatted with zero decimals must read '1'.
        # Uses assertEqual; the assertEquals alias is deprecated.
        self.assertEqual('%0.0f' % self.vector.mod(), '1')

    def test_getPermutedDimensionValue(self):
        # The accessor must agree with indexing by the permuted dimension.
        expected = self.vector[self.permutation.applyFunction(10)]
        self.assertEqual(
            expected,
            self.vector.getPermutedDimensionValue(self.permutation, 10))

    def test_getPermutedVector(self):
        permutedVector = self.vector.getPermutedVector(self.permutation)
        # The result must be exactly a RandomGaussianUnitVector.
        self.assertEqual(RandomGaussianUnitVector, type(permutedVector))
        self.assertNotEqual(self.vector, permutedVector)
        # Same coarse magnitude check as for the original vector.
        self.assertEqual('1', '%0.0f' % permutedVector.mod())

    def test_isPermutationSameAsVector(self):
        # NOTE(review): setting a=1 and b=0 presumably yields the identity
        # permutation -- confirm against VectorPermutation.
        self.permutation.a = 1
        self.permutation.b = 0
        self.assertTrue(
            self.vector.isPermutationSameAsVector(self.permutation))
def test_setSignatureUsingVectorPermutations(self):
    """Signatures from permuted vectors and from permutations must match.

    Checked with a dimension map covering all 53 dimensions and again with a
    map that only covers the first ``dimensions - 50`` of them.
    """
    dimensions, signatureLength = 53, 13

    # Build the full map first, then the one with missing dimensions; each
    # forward-maps a dimension index to itself.
    dimensionMaps = []
    for coveredDimensions in (dimensions, dimensions - 50):
        dimensionMap = TwoWayMap()
        for index in range(coveredDimensions):
            dimensionMap.set(TwoWayMap.MAP_FORWARD, index, index)
        dimensionMaps.append(dimensionMap)
    fullMap, mapWithMissingDimensions = dimensionMaps

    unitVector = RandomGaussianUnitVector(
        dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    permutatedUnitVectors = []
    for vectorPermutation in vectorPermutations:
        permutatedUnitVectors.append(
            unitVector.getPermutedVector(vectorPermutation))

    # The same vector backs both documents; only the document ids differ.
    documentVector = VectorGenerator.getRandomGaussianUnitVector(
        dimension=dimensions, mu=0, sigma=1)
    documentFromVectors = Document(1, documentVector)
    documentFromPermutations = Document(2, documentVector)

    def assertSignaturesAgree(dimensionMap):
        # Recompute both signatures against the given map and compare them.
        documentFromVectors.setSignatureUsingVectors(
            permutatedUnitVectors, dimensionMap)
        documentFromPermutations.setSignatureUsingVectorPermutations(
            unitVector, vectorPermutations, dimensionMap)
        self.assertEqual(documentFromVectors.signature,
                         documentFromPermutations.signature)

    assertSignaturesAgree(fullMap)
    assertSignaturesAgree(mapWithMissingDimensions)
def __init__(self, **clustering_settings):
    """Build the clustering state from keyword settings.

    Keys read from ``clustering_settings``:
    ``threshold_for_document_to_be_in_cluster``, ``dimensions``,
    ``signature_length``, ``number_of_permutations`` and the optional
    ``signature_type`` (defaults to ``'signature_type_trie'``). Any other
    ``signature_type`` selects SignaturePermutationWithSortedList.
    """
    self.thresholdForDocumentToBeInACluster = clustering_settings[
        'threshold_for_document_to_be_in_cluster']

    self.unitVector = RandomGaussianUnitVector(
        dimensions=clustering_settings['dimensions'], mu=0, sigma=1)
    signatureLength = clustering_settings['signature_length']
    self.vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, clustering_settings['dimensions'], self.unitVector)

    # Choose the permutation implementation once and instantiate it once per
    # requested permutation.
    useTrie = clustering_settings.get(
        'signature_type', 'signature_type_trie') == 'signature_type_trie'
    permutationClass = (SignaturePermutationWithTrie
                        if useTrie else SignaturePermutationWithSortedList)
    self.signaturePermutations = [
        permutationClass(signatureLength)
        for _ in range(clustering_settings['number_of_permutations'])
    ]

    self.phraseTextAndDimensionMap = TwoWayMap()
    self.clusters = {}
    self.clustering_settings = clustering_settings
def setUp(self):
    """Create the vector and permutation fixtures shared by the tests."""
    # Both fixtures use the same number of dimensions.
    dimensionCount = 5
    self.vector = RandomGaussianUnitVector(
        dimensions=dimensionCount, mu=0, sigma=1)
    self.permutation = VectorPermutation(dimensions=dimensionCount)
def offlineLSHClusteringDemo():
    """Train and evaluate an LSH nearest-cluster model on the offline data.

    Training documents are read from ``../data/train_offline.dat`` and grouped
    by cluster id; one document per cluster is built with
    ``Vector.getMeanVector`` and added to every signature permutation. Test
    documents from ``../data/test_offline.dat`` are then assigned to the
    candidate cluster with the highest cosine similarity.

    Each input line is split on whitespace: the first token is the cluster
    id, the remaining tokens are words.

    Returns:
        The result of ``EvaluationMetrics.purity(predicted, labels)``.

    Raises:
        ValueError: if no candidate cluster is found for a test document
            (``max()`` over an empty sequence).
    """
    wordToDimensionMap = {}

    def createDocumentFromLine(docId, line):
        # Build a word-count vector; each new word gets the next free
        # dimension index, shared across training and test documents.
        vector = Vector()
        words = line.split()
        for word in words[1:]:
            wordDimension = wordToDimensionMap.setdefault(
                word, len(wordToDimensionMap))
            if wordDimension not in vector:
                vector[wordDimension] = 1
            else:
                vector[wordDimension] += 1
        return Document(docId, vector, clusterId=words[0])

    dimensions = 53
    signatureLength = 13
    numberOfPermutations = 5

    unitVector = RandomGaussianUnitVector(
        dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    signaturePermutations = [
        SignaturePermutationWithTrie(signatureLength)
        for _ in range(numberOfPermutations)
    ]
    permutatedUnitVectors = [
        unitVector.getPermutedVector(r) for r in vectorPermutations
    ]

    # Build LSH Model.
    # Read training documents.
    trainingDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/train_offline.dat')):
        trainingDocumentsMap[docId] = createDocumentFromLine(docId, l)

    # Construct cluster vectors.
    clusterToDocumentsMap = defaultdict(list)
    for document in trainingDocumentsMap.values():
        clusterToDocumentsMap[document.clusterId].append(document)
    clusterMap = {}
    # items() works on both Python 2 and 3; iteritems() is Python 2 only.
    for k, v in clusterToDocumentsMap.items():
        clusterMap[k] = Document(
            docId=k, vector=Vector.getMeanVector(v), clusterId=k)

    # Create signatures and signaturePermutations for all the clusters.
    # An explicit loop is used because map() is lazy on Python 3 and would
    # never run the side-effecting call.
    for document in clusterMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)
    for permutation in signaturePermutations:
        for document in clusterMap.values():
            permutation.addDocument(document)

    # Testing the model.
    # Read testing documents.
    testDocumentsMap = {}
    for docId, l in enumerate(
            FileIO.iterateLinesFromFile('../data/test_offline.dat')):
        testDocumentsMap[docId] = createDocumentFromLine(docId, l)

    # Create signatures for test documents.
    for document in testDocumentsMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)

    predicted, labels = [], []
    for t in testDocumentsMap.values():
        # Union of the candidates proposed by every signature permutation.
        possibleNearestClusters = set().union(
            *(permutation.getNearestDocuments(t)
              for permutation in signaturePermutations))
        # NOTE(review): max() raises ValueError when no candidates were
        # found; decide whether a fallback prediction is wanted here.
        predictedClass = max(
            ((clusterId, clusterMap[clusterId].cosineSimilarity(t))
             for clusterId in possibleNearestClusters),
            key=itemgetter(1))
        predicted.append(predictedClass[0])
        labels.append(t.clusterId)
    return EvaluationMetrics.purity(predicted, labels)