def setUp(self):
    """Prepare two signed random documents and index them in ``self.pm``.

    Sets ``dimension`` (50) and ``signatureLength`` (23), fills
    ``phraseTextAndDimensionMap`` so that every index in
    ``range(dimension)`` maps forward to itself, draws one random Gaussian
    unit vector per signature position, and creates ``doc1``/``doc2`` whose
    signatures are set from those vectors.  Both documents are finally added
    to a ``SignaturePermutationWithTrie`` stored as ``self.pm``.
    """
    self.dimension = 50
    self.signatureLength = 23

    def newRandomUnitVector():
        # Every random vector in this fixture is drawn with the same arguments.
        return VectorGenerator.getRandomGaussianUnitVector(
            dimension=self.dimension, mu=0, sigma=1)

    # Identity mapping: index -> index for each of the `dimension` indices.
    dimensionMap = self.phraseTextAndDimensionMap = TwoWayMap()
    for index in range(self.dimension):
        dimensionMap.set(TwoWayMap.MAP_FORWARD, index, index)

    # One random vector per signature position.
    randomVectors = []
    for _ in range(self.signatureLength):
        randomVectors.append(newRandomUnitVector())
    self.unitRandomVectors = randomVectors

    self.doc1 = Document(1, newRandomUnitVector())
    self.doc2 = Document(2, newRandomUnitVector())
    for document in (self.doc1, self.doc2):
        document.setSignatureUsingVectors(
            self.unitRandomVectors, self.phraseTextAndDimensionMap)

    self.pm = SignaturePermutationWithTrie(
        signatureLength=self.signatureLength)
    for document in (self.doc1, self.doc2):
        self.pm.addDocument(document)
def setUp(self):
    """Create the phrase fixtures and set ``stream_settings['dimensions']`` to 2.

    Populates ``phraseVector``, a ``phraseTextAndDimensionMap`` that knows
    only 'project' (0) and 'cluster' (1), a ``phraseTextToPhraseObjectMap``
    with three ``Phrase`` objects, and a four-dimensional ``vector``.  The
    previous ``stream_settings['dimensions']`` value is kept in
    ``self.initial_max_dimensions`` before being overwritten.
    """
    self.phraseVector = dict.fromkeys(
        ['project', 'cluster', 'highdimensional', 'streams'], 1)

    dimensionMap = self.phraseTextAndDimensionMap = TwoWayMap()
    for phraseText, dimensionId in (('project', 0), ('cluster', 1)):
        dimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimensionId)

    phraseObjects = {}
    for phraseText in ('project', 'cluster'):
        phraseObjects[phraseText] = Phrase(phraseText, test_time, score=8)
    # 'abcd' is timestamped three inactivity periods before test_time.
    inactivityPeriod = stream_settings['max_phrase_inactivity_time_in_seconds']
    phraseObjects['abcd'] = Phrase(
        'abcd', test_time - 3 * inactivityPeriod, score=8)
    self.phraseTextToPhraseObjectMap = phraseObjects

    # Equivalent to {0: 1, 1: 1, 2: 1, 3: 1}.
    self.vector = Vector(dict.fromkeys(range(4), 1))

    # Remember the current setting before replacing it.
    # NOTE(review): presumably restored in tearDown -- confirm.
    self.initial_max_dimensions = stream_settings['dimensions']
    stream_settings['dimensions'] = 2
def __init__(self, **settings):
    """Initialise the random unit vector, permutations and lookup maps.

    Keys read from ``settings``:
        nearest_neighbor_threshold: stored as ``nearestNeighborThreshold``.
        dimensions: passed to ``RandomGaussianUnitVector`` and
            ``VectorPermutation.getPermutations``.
        signature_length: passed to ``getPermutations`` and to every
            signature-permutation object.
        number_of_permutations: how many signature-permutation objects to
            create.
        signature_type (optional): ``'signature_type_trie'`` (the default)
            selects ``SignaturePermutationWithTrie``; any other value
            selects ``SignaturePermutationWithSortedList``.

    Raises:
        KeyError: if a required key is missing from ``settings``.
    """
    self.settings = settings
    self.nearestNeighborThreshold = settings['nearest_neighbor_threshold']

    dimensions = settings['dimensions']
    self.unitVector = RandomGaussianUnitVector(
        dimensions=dimensions, mu=0, sigma=1)
    signatureLength = settings['signature_length']
    self.vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, self.unitVector)

    # Choose the permutation implementation once, then build all instances.
    if settings.get('signature_type',
                    'signature_type_trie') == 'signature_type_trie':
        permutationClass = SignaturePermutationWithTrie
    else:
        permutationClass = SignaturePermutationWithSortedList
    self.signaturePermutations = [
        permutationClass(signatureLength)
        for _ in range(settings['number_of_permutations'])
    ]

    self.phraseTextAndDimensionMap = TwoWayMap()
    self.documentIdToDocumentMap = {}
def test_setSignatureUsingVectorPermutations(self):
    # Verifies that Document.setSignatureUsingVectorPermutations produces the
    # same signature as Document.setSignatureUsingVectors when the latter is
    # handed the explicitly permuted unit vectors.  The comparison is made
    # twice: with a map covering every dimension and with a map covering
    # only the first few.
    dimensions = 53
    signatureLength = 13

    def identityMap(size):
        # Map each index in range(size) forward to itself.
        mapping = TwoWayMap()
        for index in range(size):
            mapping.set(TwoWayMap.MAP_FORWARD, index, index)
        return mapping

    completeMap = identityMap(dimensions)
    # Covers only the first dimensions - 50 (= 3) indices.
    partialMap = identityMap(dimensions - 50)

    unitVector = RandomGaussianUnitVector(
        dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    permutedUnitVectors = []
    for permutation in vectorPermutations:
        permutedUnitVectors.append(unitVector.getPermutedVector(permutation))

    # Both documents share the same underlying vector.
    documentVector = VectorGenerator.getRandomGaussianUnitVector(
        dimension=dimensions, mu=0, sigma=1)
    signedByVectors = Document(1, documentVector)
    signedByPermutations = Document(2, documentVector)

    for dimensionMap in (completeMap, partialMap):
        signedByVectors.setSignatureUsingVectors(
            permutedUnitVectors, dimensionMap)
        signedByPermutations.setSignatureUsingVectorPermutations(
            unitVector, vectorPermutations, dimensionMap)
        self.assertEqual(signedByVectors.signature,
                         signedByPermutations.signature)
def setUp(self):
    """Create the phrase fixtures and set ``settings['dimensions']`` to 2.

    Populates ``phraseVector``, a ``phraseTextAndDimensionMap`` that knows
    only 'project' (0) and 'cluster' (1), and ``finalPhraseToIdMap`` which
    assigns ids 0-3 to all four phrases.
    """
    self.phraseVector = dict.fromkeys(
        ['project', 'cluster', 'highdimensional', 'streams'], 1)

    dimensionMap = self.phraseTextAndDimensionMap = TwoWayMap()
    for phraseText, dimensionId in (('project', 0), ('cluster', 1)):
        dimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimensionId)

    # Ids follow the listed order: project=0, cluster=1, streams=2,
    # highdimensional=3.
    orderedPhrases = ['project', 'cluster', 'streams', 'highdimensional']
    self.finalPhraseToIdMap = {
        phraseText: phraseId
        for phraseId, phraseText in enumerate(orderedPhrases)
    }

    # NOTE(review): this overwrites an entry of the `settings` mapping
    # defined outside this method without saving the old value -- confirm
    # that something restores it between tests.
    settings['dimensions'] = 2
def __init__(self, **clustering_settings):
    """Initialise the random unit vector, permutations and cluster storage.

    Keys read from ``clustering_settings``:
        threshold_for_document_to_be_in_cluster: stored as
            ``thresholdForDocumentToBeInACluster``.
        dimensions: passed to ``RandomGaussianUnitVector`` and
            ``VectorPermutation.getPermutations``.
        signature_length: passed to ``getPermutations`` and to every
            signature-permutation object.
        number_of_permutations: how many signature-permutation objects to
            create.
        signature_type (optional): ``'signature_type_trie'`` (the default)
            selects ``SignaturePermutationWithTrie``; any other value
            selects ``SignaturePermutationWithSortedList``.

    Raises:
        KeyError: if a required key is missing from ``clustering_settings``.
    """
    self.thresholdForDocumentToBeInACluster = clustering_settings[
        'threshold_for_document_to_be_in_cluster']

    dimensions = clustering_settings['dimensions']
    self.unitVector = RandomGaussianUnitVector(
        dimensions=dimensions, mu=0, sigma=1)
    signatureLength = clustering_settings['signature_length']
    self.vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, self.unitVector)

    signatureType = clustering_settings.get('signature_type',
                                            'signature_type_trie')
    # Only the selected class name is evaluated.
    permutationClass = (
        SignaturePermutationWithTrie
        if signatureType == 'signature_type_trie'
        else SignaturePermutationWithSortedList)
    self.signaturePermutations = [
        permutationClass(signatureLength)
        for _ in range(clustering_settings['number_of_permutations'])
    ]

    self.phraseTextAndDimensionMap = TwoWayMap()
    self.clusters = {}
    self.clustering_settings = clustering_settings
def test_setSignatureUsingVectors(self):
    # Verifies the signatures produced by Document.setSignatureUsingVectors
    # for a document whose phrases are all in the phrase/dimension map and
    # for one that contains a phrase missing from the map.
    dimensionMap = TwoWayMap()
    for phraseText, dimensionId in (('a', 1), ('b', 2)):
        dimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimensionId)

    fullyMappedDocument = Document(1, {'a': 1, 'b': 4})
    # 'c' has no entry in dimensionMap.
    partlyMappedDocument = Document(1, {'a': 1, 'c': 4})

    firstVector = Vector({1: 3 / 5., 2: -4 / 5.})
    secondVector = Vector({1: -5 / 13., 2: 12 / 13.})
    vectors = [firstVector, secondVector]

    for document in (fullyMappedDocument, partlyMappedDocument):
        document.setSignatureUsingVectors(vectors, dimensionMap)

    expectations = (('01', fullyMappedDocument),
                    ('10', partlyMappedDocument))
    for expectedBits, document in expectations:
        self.assertEqual(Signature(expectedBits), document.signature)