class CrowdTests(unittest.TestCase):
    """Tests for the Crowd container built on top of StreamCluster."""

    def setUp(self):
        self.m1 = Message(1, 'sdf',
                          'A project to cluster high-dimensional streams.',
                          test_time - timedelta(seconds=60))
        self.m1.vector = Vector({1: 1., 2: 3.})
        self.stream = Stream(1, self.m1)
        self.cluster = StreamCluster(self.stream)
        self.crowd = Crowd(self.cluster, test_time)

    def test_intitialization(self):
        # A crowd inherits its id from its founding cluster.
        self.assertEqual(self.cluster.clusterId, self.crowd.crowdId)

    def test_append(self):
        dayLater = test_time + timedelta(days=1)
        self.crowd.append(self.cluster, dayLater)
        self.assertEqual(
            [GeneralMethods.getEpochFromDateTimeObject(test_time),
             GeneralMethods.getEpochFromDateTimeObject(dayLater)],
            sorted(self.crowd.clusters.keys()))
        startEpoch = GeneralMethods.getEpochFromDateTimeObject(test_time)
        self.assertEqual(StreamCluster, type(self.crowd.clusters[startEpoch]))
        self.assertEqual(2, self.crowd.lifespan)
        self.assertEqual(
            getStringRepresentationForTweetTimestamp(test_time),
            getStringRepresentationForTweetTimestamp(self.crowd.startTime))
        self.assertEqual(
            getStringRepresentationForTweetTimestamp(dayLater),
            getStringRepresentationForTweetTimestamp(self.crowd.endTime))

    def test_maxClusterSize(self):
        self.assertEqual(1, self.crowd.maxClusterSize)
        newMessage = Message(4, 'sdf',
                             'A project to cluster high-dimensional streams.',
                             test_time)
        newMessage.vector = Vector({2: 4})
        self.cluster.addDocument(Stream(4, newMessage))
        self.assertEqual(2, self.crowd.maxClusterSize)

    def test_crowdSize(self):
        self.assertEqual(1, self.crowd.crowdSize)
        for streamId in (2, 3):
            self.cluster.addDocument(Stream(streamId, self.m1))
        self.assertEqual(3, self.crowd.crowdSize)
        # A later, smaller cluster must not change the reported crowd size.
        laterCluster = StreamCluster(Stream(3, self.m1))
        self.crowd.append(laterCluster, test_time + timedelta(days=2))
        self.assertNotEqual(4, self.crowd.crowdSize)
        self.assertEqual(3, self.crowd.crowdSize)
class TwitterCrowdsSpecificMethodsTests(unittest.TestCase):
    """Tests for TwitterCrowdsSpecificMethods: tweet-JSON conversion,
    cluster combination, and cluster <-> map serialization."""

    def setUp(self):
        self.tweet = {'user': {'screen_name': 'abc'},
                      'id': 10,
                      'text': 'A project to cluster high-dimensional streams.',
                      'created_at': 'Tue Mar 01 05:59:59 +0000 2011'}
        m1 = Message(1, '', '', datetime.now())
        m1.vector = Vector({'#tcot': 2, 'dsf': 4})
        self.cluster1 = StreamCluster(Stream(1, m1))
        m2 = Message(2, '', '', datetime.now())
        m2.vector = Vector({'#tcot': 4})
        self.cluster2 = StreamCluster(Stream(2, m2))
        m3 = Message(3, '', '', datetime.now())
        # Was Vector(Vector({...})); the double wrap was redundant.
        m3.vector = Vector({'#tcot': 2})
        m4 = Message(4, '', '', datetime.now())
        m4.vector = Vector({'#tcot': 2})
        self.doc1 = Stream(1, m3)
        self.doc2 = Stream(2, m4)
        # Mean over both seed clusters and both extra documents, captured
        # before the documents are folded into the clusters.
        self.meanVectorForAllDocuments = Vector.getMeanVector(
            [self.cluster1, self.cluster2, self.doc1, self.doc2])
        self.cluster1.addDocument(self.doc1)
        self.cluster2.addDocument(self.doc2)

    def test_convertTweetJSONToMessage(self):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            self.tweet, **twitter_stream_settings)
        self.assertEqual(
            {'project': 1, 'cluster': 1, 'streams': 1, 'highdimensional': 1},
            message.vector)

    def test_combineClusters(self):
        clustersMap = {self.cluster1.clusterId: self.cluster1,
                       self.cluster2.clusterId: self.cluster2}
        clustersMap = TwitterCrowdsSpecificMethods.combineClusters(
            clustersMap, **twitter_stream_settings)
        self.assertEqual(1, len(clustersMap))
        # FIX: dict.values() is a non-indexable view on Python 3, so the
        # original values()[0] raised TypeError. next(iter(...)) works on
        # both Python 2 and 3.
        mergedCluster = next(iter(clustersMap.values()))
        self.assertEqual([self.doc1, self.doc2],
                         list(mergedCluster.iterateDocumentsInCluster()))
        self.assertEqual(self.meanVectorForAllDocuments, mergedCluster)
        # Every merged document must have been re-stamped with the merged
        # cluster's id.
        self.assertEqual(
            [mergedCluster.docId, mergedCluster.docId],
            list(doc.clusterId
                 for doc in mergedCluster.iterateDocumentsInCluster()))
        self.assertEqual([self.cluster1.clusterId, self.cluster2.clusterId],
                         mergedCluster.mergedClustersList)

    def test_getClusterInMapFormat(self):
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        mergedCluster.mergedClustersList = [self.cluster1.clusterId]
        mergedCluster.lastStreamAddedTime = test_time
        mapRepresentation = {
            'clusterId': mergedCluster.clusterId,
            'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(
                mergedCluster.lastStreamAddedTime),
            'mergedClustersList': [self.cluster1.clusterId],
            'streams': [self.doc1.docId],
            'dimensions': {'#tcot': 2, 'dsf': 2},
        }
        self.assertEqual(
            mapRepresentation,
            TwitterCrowdsSpecificMethods.getClusterInMapFormat(mergedCluster))

    def test_getClusterFromMapFormat(self):
        mapRepresentation = {
            'clusterId': 1,
            'mergedClustersList': [self.cluster1.clusterId],
            'lastStreamAddedTime':
                getStringRepresentationForTweetTimestamp(test_time),
            'streams': [self.doc1.docId],
            'dimensions': {'#tcot': 2, 'dsf': 2},
        }
        cluster = TwitterCrowdsSpecificMethods.getClusterFromMapFormat(
            mapRepresentation)
        self.assertEqual(1, cluster.clusterId)
        self.assertEqual([self.cluster1.clusterId], cluster.mergedClustersList)
        self.assertEqual([self.doc1.docId], cluster.documentsInCluster)
        self.assertEqual({'#tcot': 2, 'dsf': 2}, cluster)
        self.assertEqual(
            getStringRepresentationForTweetTimestamp(test_time),
            getStringRepresentationForTweetTimestamp(
                cluster.lastStreamAddedTime))
class CrowdTests(unittest.TestCase):
    """Exercises Crowd lifecycle: creation, append, and size accounting."""

    def setUp(self):
        seedMessage = Message(1, 'sdf',
                              'A project to cluster high-dimensional streams.',
                              test_time - timedelta(seconds=60))
        seedMessage.vector = Vector({1: 1., 2: 3.})
        self.m1 = seedMessage
        self.stream = Stream(1, seedMessage)
        self.cluster = StreamCluster(self.stream)
        self.crowd = Crowd(self.cluster, test_time)

    def test_intitialization(self):
        # Crowd id comes straight from the seed cluster.
        self.assertEqual(self.cluster.clusterId, self.crowd.crowdId)

    def test_append(self):
        secondDay = test_time + timedelta(days=1)
        self.crowd.append(self.cluster, secondDay)
        expectedEpochs = [GeneralMethods.getEpochFromDateTimeObject(t)
                          for t in (test_time, secondDay)]
        self.assertEqual(expectedEpochs, sorted(self.crowd.clusters.keys()))
        self.assertEqual(
            StreamCluster,
            type(self.crowd.clusters[expectedEpochs[0]]))
        self.assertEqual(2, self.crowd.lifespan)
        self.assertEqual(
            getStringRepresentationForTweetTimestamp(test_time),
            getStringRepresentationForTweetTimestamp(self.crowd.startTime))
        self.assertEqual(
            getStringRepresentationForTweetTimestamp(secondDay),
            getStringRepresentationForTweetTimestamp(self.crowd.endTime))

    def test_maxClusterSize(self):
        self.assertEqual(1, self.crowd.maxClusterSize)
        extraMessage = Message(4, 'sdf',
                               'A project to cluster high-dimensional streams.',
                               test_time)
        extraMessage.vector = Vector({2: 4})
        extraStream = Stream(4, extraMessage)
        self.cluster.addDocument(extraStream)
        self.assertEqual(2, self.crowd.maxClusterSize)

    def test_crowdSize(self):
        self.assertEqual(1, self.crowd.crowdSize)
        self.cluster.addDocument(Stream(2, self.m1))
        self.cluster.addDocument(Stream(3, self.m1))
        self.assertEqual(3, self.crowd.crowdSize)
        # Appending a later, single-stream cluster leaves crowdSize at the
        # maximum observed cluster size.
        smallCluster = StreamCluster(Stream(3, self.m1))
        self.crowd.append(smallCluster, test_time + timedelta(days=2))
        self.assertNotEqual(4, self.crowd.crowdSize)
        self.assertEqual(3, self.crowd.crowdSize)
class StreamClusterTests(unittest.TestCase):
    """Tests for StreamCluster: construction, merge-from copies, merging,
    document addition, and threshold-based iteration."""

    def setUp(self):
        text = 'A project to cluster high-dimensional streams.'
        self.m1 = Message(1, 'sdf', text, test_time - timedelta(seconds=60))
        self.m1.vector = Vector({1: 2, 2: 4})
        self.stream1 = Stream(1, self.m1)
        self.m2 = Message(2, 'sdf', text, test_time)
        self.m2.vector = Vector({2: 4})
        self.stream2 = Stream(2, self.m2)
        self.m3 = Message(3, 'sdf', text, test_time + timedelta(seconds=60))
        self.m3.vector = Vector({2: 4})
        self.stream3 = Stream(3, self.m3)
        self.cluster1 = StreamCluster(self.stream1)
        self.cluster2 = StreamCluster(self.stream2)
        self.cluster3 = StreamCluster(self.stream3)

    def test_initialization(self):
        self.assertEqual(test_time - timedelta(seconds=60),
                         self.cluster1.lastStreamAddedTime)
        self.assertEqual(test_time, self.cluster2.lastStreamAddedTime)

    def test_getClusterObjectToMergeFrom(self):
        originalDocuments = list(self.cluster1.iterateDocumentsInCluster())
        merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        self.assertEqual(test_time - timedelta(seconds=60),
                         merged.lastStreamAddedTime)
        # Same vector content, but a brand-new cluster identity.
        self.assertEqual(self.cluster1, merged)
        self.assertNotEqual(self.cluster1.clusterId, merged.clusterId)
        self.assertEqual(originalDocuments,
                         list(merged.iterateDocumentsInCluster()))
        self.assertEqual(self.cluster1.aggregateVector, merged.aggregateVector)
        self.assertEqual(self.cluster1.vectorWeights, merged.vectorWeights)
        self.assertEqual(self.cluster1.lastStreamAddedTime,
                         merged.lastStreamAddedTime)

    def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
        merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        merged.mergeCluster(self.cluster2)
        self.assertEqual([self.stream1, self.stream2],
                         list(merged.iterateDocumentsInCluster()))
        expectedMean = Vector.getMeanVector([self.stream1, self.stream2])
        self.assertEqual(expectedMean, merged)
        self.assertEqual(
            [merged.docId, merged.docId],
            [doc.clusterId for doc in merged.iterateDocumentsInCluster()])
        self.assertEqual(self.cluster2.lastStreamAddedTime,
                         merged.lastStreamAddedTime)

    def test_mergeCluster_lastStreamAddedTime_lesser_than_original_cluster(self):
        merged = StreamCluster.getClusterObjectToMergeFrom(self.cluster3)
        merged.mergeCluster(self.cluster1)
        self.assertTrue(self.cluster1.lastStreamAddedTime
                        < self.cluster3.lastStreamAddedTime)
        # Merging an older cluster must not move the timestamp backwards.
        self.assertEqual(self.cluster3.lastStreamAddedTime,
                         merged.lastStreamAddedTime)
        self.assertNotEqual(self.cluster1.lastStreamAddedTime,
                            merged.lastStreamAddedTime)

    def test_addDocument(self):
        text = 'A project to cluster high-dimensional streams.'
        msgA = Message(3, 'sdf', text, test_time)
        msgA.vector = Vector({3: 4})
        streamA = Stream(3, msgA)
        msgB = Message(4, 'sdf', text, test_time)
        msgB.vector = Vector({2: 4})
        streamB = Stream(4, msgB)
        self.assertNotEqual(test_time, self.cluster1.lastStreamAddedTime)
        self.cluster1.addDocument(streamA)
        self.assertEqual(test_time, self.cluster1.lastStreamAddedTime)
        # The stream gets stamped with the cluster's id.
        self.assertEqual(self.cluster1.clusterId, streamA.clusterId)
        # The cluster mean is updated.
        self.assertEqual({1: 2 / 2., 2: 2., 3: 2.}, self.cluster1)
        # The cluster aggregate is updated.
        self.assertEqual({1: 2, 2: 4, 3: 4}, self.cluster1.aggregateVector)
        # The document is tracked by the cluster.
        self.assertEqual(streamA,
                         self.cluster1.documentsInCluster[streamA.docId])
        self.cluster1.addDocument(streamB)
        self.assertEqual(3, self.cluster1.vectorWeights)
        self.assertEqual({1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}, self.cluster1)
        self.assertEqual({1: 2, 2: 8, 3: 4}, self.cluster1.aggregateVector)

    def test_clustersIteration(self):
        clusters = [self.cluster1, self.cluster2, self.cluster3]

        def below(threshold):
            # Collect clusters whose lastStreamAddedTime is below threshold.
            return list(StreamCluster.getClustersByAttributeAndThreshold(
                clusters, 'lastStreamAddedTime', threshold,
                StreamCluster.BELOW_THRESHOLD))

        self.assertEqual([self.cluster1], below(test_time))
        self.assertEqual([self.cluster1, self.cluster2],
                         below(test_time + timedelta(seconds=60)))
class StreamClusterTests(unittest.TestCase):
    """StreamCluster behavior: init timestamps, merge-source copies,
    cluster merging, adding documents, and filtered iteration."""

    def setUp(self):
        projectText = 'A project to cluster high-dimensional streams.'
        self.m1 = Message(1, 'sdf', projectText,
                          test_time - timedelta(seconds=60))
        self.m1.vector = Vector({1: 2, 2: 4})
        self.m2 = Message(2, 'sdf', projectText, test_time)
        self.m2.vector = Vector({2: 4})
        self.m3 = Message(3, 'sdf', projectText,
                          test_time + timedelta(seconds=60))
        self.m3.vector = Vector({2: 4})
        self.stream1 = Stream(1, self.m1)
        self.stream2 = Stream(2, self.m2)
        self.stream3 = Stream(3, self.m3)
        self.cluster1 = StreamCluster(self.stream1)
        self.cluster2 = StreamCluster(self.stream2)
        self.cluster3 = StreamCluster(self.stream3)

    def test_initialization(self):
        # lastStreamAddedTime reflects the seed message's timestamp.
        self.assertEqual(test_time - timedelta(seconds=60),
                         self.cluster1.lastStreamAddedTime)
        self.assertEqual(test_time, self.cluster2.lastStreamAddedTime)

    def test_getClusterObjectToMergeFrom(self):
        docsBefore = list(self.cluster1.iterateDocumentsInCluster())
        copyCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        self.assertEqual(test_time - timedelta(seconds=60),
                         copyCluster.lastStreamAddedTime)
        # Equal vector content, distinct cluster id.
        self.assertEqual(self.cluster1, copyCluster)
        self.assertNotEqual(self.cluster1.clusterId, copyCluster.clusterId)
        self.assertEqual(docsBefore,
                         list(copyCluster.iterateDocumentsInCluster()))
        self.assertEqual(self.cluster1.aggregateVector,
                         copyCluster.aggregateVector)
        self.assertEqual(self.cluster1.vectorWeights,
                         copyCluster.vectorWeights)
        self.assertEqual(self.cluster1.lastStreamAddedTime,
                         copyCluster.lastStreamAddedTime)

    def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
        target = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        target.mergeCluster(self.cluster2)
        self.assertEqual([self.stream1, self.stream2],
                         list(target.iterateDocumentsInCluster()))
        self.assertEqual(Vector.getMeanVector([self.stream1, self.stream2]),
                         target)
        self.assertEqual(
            [target.docId, target.docId],
            [doc.clusterId for doc in target.iterateDocumentsInCluster()])
        # The newer cluster's timestamp wins.
        self.assertEqual(self.cluster2.lastStreamAddedTime,
                         target.lastStreamAddedTime)

    def test_mergeCluster_lastStreamAddedTime_lesser_than_original_cluster(self):
        target = StreamCluster.getClusterObjectToMergeFrom(self.cluster3)
        target.mergeCluster(self.cluster1)
        self.assertTrue(self.cluster1.lastStreamAddedTime
                        < self.cluster3.lastStreamAddedTime)
        # Absorbing an older cluster keeps the newer timestamp.
        self.assertEqual(self.cluster3.lastStreamAddedTime,
                         target.lastStreamAddedTime)
        self.assertNotEqual(self.cluster1.lastStreamAddedTime,
                            target.lastStreamAddedTime)

    def test_addDocument(self):
        projectText = 'A project to cluster high-dimensional streams.'
        firstMessage = Message(3, 'sdf', projectText, test_time)
        firstMessage.vector = Vector({3: 4})
        firstStream = Stream(3, firstMessage)
        secondMessage = Message(4, 'sdf', projectText, test_time)
        secondMessage.vector = Vector({2: 4})
        secondStream = Stream(4, secondMessage)
        self.assertNotEqual(test_time, self.cluster1.lastStreamAddedTime)
        self.cluster1.addDocument(firstStream)
        self.assertEqual(test_time, self.cluster1.lastStreamAddedTime)
        # The cluster id is propagated to the stream.
        self.assertEqual(self.cluster1.clusterId, firstStream.clusterId)
        # The cluster mean is recomputed.
        self.assertEqual({1: 2 / 2., 2: 2., 3: 2.}, self.cluster1)
        # The cluster aggregate is recomputed.
        self.assertEqual({1: 2, 2: 4, 3: 4}, self.cluster1.aggregateVector)
        # The document is registered under its docId.
        self.assertEqual(firstStream,
                         self.cluster1.documentsInCluster[firstStream.docId])
        self.cluster1.addDocument(secondStream)
        self.assertEqual(3, self.cluster1.vectorWeights)
        self.assertEqual({1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}, self.cluster1)
        self.assertEqual({1: 2, 2: 8, 3: 4}, self.cluster1.aggregateVector)

    def test_clustersIteration(self):
        allClusters = [self.cluster1, self.cluster2, self.cluster3]

        def clustersBelow(cutoff):
            # All clusters whose lastStreamAddedTime falls below the cutoff.
            return [c for c in StreamCluster.getClustersByAttributeAndThreshold(
                allClusters, 'lastStreamAddedTime', cutoff,
                StreamCluster.BELOW_THRESHOLD)]

        self.assertEqual([self.cluster1], clustersBelow(test_time))
        self.assertEqual([self.cluster1, self.cluster2],
                         clustersBelow(test_time + timedelta(seconds=60)))