Example #1
class CrowdTests(unittest.TestCase):
    def setUp(self):
        self.m1 = Message(1, 'sdf',
                          'A project to cluster high-dimensional streams.',
                          test_time - timedelta(seconds=60))
        self.m1.vector = Vector({1: 1., 2: 3.})
        self.stream = Stream(1, self.m1)
        self.cluster = StreamCluster(self.stream)
        self.crowd = Crowd(self.cluster, test_time)

    def test_initialization(self):
        self.assertEqual(self.cluster.clusterId, self.crowd.crowdId)

    def test_append(self):
        self.crowd.append(self.cluster, test_time + timedelta(days=1))
        self.assertEqual([
            GeneralMethods.getEpochFromDateTimeObject(test_time),
            GeneralMethods.getEpochFromDateTimeObject(test_time +
                                                      timedelta(days=1))
        ], sorted(self.crowd.clusters.keys()))
        self.assertEqual(
            StreamCluster,
            type(self.crowd.clusters[GeneralMethods.getEpochFromDateTimeObject(
                test_time)]))
        self.assertEqual(2, self.crowd.lifespan)
        self.assertEqual(
            getStringRepresentationForTweetTimestamp(test_time),
            getStringRepresentationForTweetTimestamp(self.crowd.startTime))
        self.assertEqual(
            getStringRepresentationForTweetTimestamp(test_time +
                                                     timedelta(days=1)),
            getStringRepresentationForTweetTimestamp(self.crowd.endTime))

    def test_maxClusterSize(self):
        self.assertEqual(1, self.crowd.maxClusterSize)
        message2 = Message(4, 'sdf',
                           'A project to cluster high-dimensional streams.',
                           test_time)
        message2.vector = Vector({2: 4})
        stream2 = Stream(4, message2)
        self.cluster.addDocument(stream2)
        self.assertEqual(2, self.crowd.maxClusterSize)

    def test_crowdSize(self):
        self.assertEqual(1, self.crowd.crowdSize)
        self.cluster.addDocument(Stream(2, self.m1))
        self.cluster.addDocument(Stream(3, self.m1))
        self.assertEqual(3, self.crowd.crowdSize)
        cluster = StreamCluster(Stream(3, self.m1))
        self.crowd.append(cluster, test_time + timedelta(days=2))
        self.assertNotEqual(4, self.crowd.crowdSize)
        self.assertEqual(3, self.crowd.crowdSize)
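
The assertions in test_append above pin down how a Crowd indexes appended clusters: by the epoch timestamp at which each cluster arrives, with startTime and endTime derived from those keys. A minimal sketch of that bookkeeping, assuming GeneralMethods.getEpochFromDateTimeObject simply converts a datetime to epoch seconds and using a placeholder test_time (the real fixture is not shown in these snippets):

import time
from datetime import datetime, timedelta

def to_epoch(dt):
    # assumed behaviour of GeneralMethods.getEpochFromDateTimeObject
    return time.mktime(dt.timetuple())

test_time = datetime(2011, 3, 1)  # placeholder value
clusters = {
    to_epoch(test_time): 'cluster appended when the crowd was created',
    to_epoch(test_time + timedelta(days=1)): 'cluster appended a day later',
}
# startTime/endTime correspond to the earliest and latest keys; the lifespan
# of 2 asserted in test_append matches the number of entries here.
startTime, endTime = min(clusters), max(clusters)
assert sorted(clusters) == [to_epoch(test_time),
                            to_epoch(test_time + timedelta(days=1))]
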
class TwitterCrowdsSpecificMethodsTests(unittest.TestCase):
    def setUp(self):
        self.tweet = {
            'user': {'screen_name': 'abc'},
            'id': 10,
            'text': 'A project to cluster high-dimensional streams.',
            'created_at': 'Tue Mar 01 05:59:59 +0000 2011'
        }
        m1 = Message(1, '', '', datetime.now())
        m1.vector = Vector({'#tcot': 2, 'dsf': 4})
        self.cluster1 = StreamCluster(Stream(1, m1))
        m2 = Message(2, '', '', datetime.now())
        m2.vector = Vector({'#tcot': 4})
        self.cluster2 = StreamCluster(Stream(2, m2))
        m3 = Message(3, '', '', datetime.now())
        m3.vector = Vector({'#tcot': 2})
        m4 = Message(4, '', '', datetime.now())
        m4.vector = Vector({'#tcot': 2})
        self.doc1 = Stream(1, m3)
        self.doc2 = Stream(2, m4)
        # Expected mean vector, computed before the documents are added to the clusters.
        self.meanVectorForAllDocuments = Vector.getMeanVector(
            [self.cluster1, self.cluster2, self.doc1, self.doc2])
        self.cluster1.addDocument(self.doc1)
        self.cluster2.addDocument(self.doc2)

    def test_convertTweetJSONToMessage(self):
        message = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            self.tweet, **twitter_stream_settings)
        self.assertEqual(
            {'project': 1, 'cluster': 1, 'streams': 1, 'highdimensional': 1},
            message.vector)

    def test_combineClusters(self):
        clustersMap = {
            self.cluster1.clusterId: self.cluster1,
            self.cluster2.clusterId: self.cluster2
        }
        clustersMap = TwitterCrowdsSpecificMethods.combineClusters(
            clustersMap, **twitter_stream_settings)
        self.assertEqual(1, len(clustersMap))
        mergedCluster = list(clustersMap.values())[0]
        self.assertEqual([self.doc1, self.doc2],
                         list(mergedCluster.iterateDocumentsInCluster()))
        self.assertEqual(self.meanVectorForAllDocuments, mergedCluster)
        self.assertEqual(
            [mergedCluster.docId, mergedCluster.docId],
            list(doc.clusterId
                 for doc in mergedCluster.iterateDocumentsInCluster()))
        self.assertEqual([self.cluster1.clusterId, self.cluster2.clusterId],
                         mergedCluster.mergedClustersList)

    def test_getClusterInMapFormat(self):
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
        mergedCluster.mergedClustersList = [self.cluster1.clusterId]
        mergedCluster.lastStreamAddedTime = test_time
        mapRepresentation = {
            'clusterId': mergedCluster.clusterId,
            'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(
                mergedCluster.lastStreamAddedTime),
            'mergedClustersList': [self.cluster1.clusterId],
            'streams': [self.doc1.docId],
            'dimensions': {'#tcot': 2, 'dsf': 2}
        }
        self.assertEqual(
            mapRepresentation,
            TwitterCrowdsSpecificMethods.getClusterInMapFormat(mergedCluster))

    def test_getClusterFromMapFormat(self):
        mapRepresentation = {
            'clusterId': 1,
            'mergedClustersList': [self.cluster1.clusterId],
            'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time),
            'streams': [self.doc1.docId],
            'dimensions': {'#tcot': 2, 'dsf': 2}
        }
        cluster = TwitterCrowdsSpecificMethods.getClusterFromMapFormat(
            mapRepresentation)
        self.assertEqual(1, cluster.clusterId)
        self.assertEqual([self.cluster1.clusterId], cluster.mergedClustersList)
        self.assertEqual([self.doc1.docId], cluster.documentsInCluster)
        self.assertEqual({'#tcot': 2, 'dsf': 2}, cluster)
        self.assertEqual(
            getStringRepresentationForTweetTimestamp(test_time),
            getStringRepresentationForTweetTimestamp(cluster.lastStreamAddedTime))
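
test_convertTweetJSONToMessage expects the tweet text to be reduced to a term-frequency vector with stopwords dropped and hyphens collapsed ('high-dimensional' becomes 'highdimensional'). A rough, self-contained sketch of that kind of tokenisation; it only reproduces the expected output, it is not the library's implementation, and the stopword list is just enough for this example:

import re

def to_term_vector(text, stopwords=('a', 'to')):
    # lowercase, join hyphenated words, strip punctuation, then count terms
    words = re.sub(r'[^a-z\s]', '', text.lower().replace('-', '')).split()
    vector = {}
    for word in words:
        if word not in stopwords:
            vector[word] = vector.get(word, 0) + 1
    return vector

assert to_term_vector('A project to cluster high-dimensional streams.') == \
    {'project': 1, 'cluster': 1, 'streams': 1, 'highdimensional': 1}
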
Example #5
class StreamClusterTests(unittest.TestCase):
    def setUp(self):
        self.m1 = Message(1, 'sdf',
                          'A project to cluster high-dimensional streams.',
                          test_time - timedelta(seconds=60))
        self.m1.vector = Vector({1: 2, 2: 4})
        self.stream1 = Stream(1, self.m1)
        self.m2 = Message(2, 'sdf',
                          'A project to cluster high-dimensional streams.',
                          test_time)
        self.m2.vector = Vector({2: 4})
        self.stream2 = Stream(2, self.m2)
        self.m3 = Message(3, 'sdf',
                          'A project to cluster high-dimensional streams.',
                          test_time + timedelta(seconds=60))
        self.m3.vector = Vector({2: 4})
        self.stream3 = Stream(3, self.m3)
        self.cluster1 = StreamCluster(self.stream1)
        self.cluster2 = StreamCluster(self.stream2)
        self.cluster3 = StreamCluster(self.stream3)

    def test_initialization(self):
        self.assertEqual(test_time - timedelta(seconds=60),
                         self.cluster1.lastStreamAddedTime)
        self.assertEqual(test_time, self.cluster2.lastStreamAddedTime)

    def test_getClusterObjectToMergeFrom(self):
        documentsInCluster = list(self.cluster1.iterateDocumentsInCluster())
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(
            self.cluster1)
        self.assertEqual(test_time - timedelta(seconds=60),
                         mergedCluster.lastStreamAddedTime)
        self.assertEqual(self.cluster1, mergedCluster)
        self.assertNotEqual(self.cluster1.clusterId, mergedCluster.clusterId)
        self.assertEqual(documentsInCluster,
                         list(mergedCluster.iterateDocumentsInCluster()))
        self.assertEqual(self.cluster1.aggregateVector,
                         mergedCluster.aggregateVector)
        self.assertEqual(self.cluster1.vectorWeights,
                         mergedCluster.vectorWeights)
        self.assertEqual(self.cluster1.lastStreamAddedTime,
                         mergedCluster.lastStreamAddedTime)

    def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(
            self):
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(
            self.cluster1)
        mergedCluster.mergeCluster(self.cluster2)
        self.assertEqual([self.stream1, self.stream2],
                         list(mergedCluster.iterateDocumentsInCluster()))
        meanVectorForAllDocuments = Vector.getMeanVector(
            [self.stream1, self.stream2])
        self.assertEqual(meanVectorForAllDocuments, mergedCluster)
        self.assertEqual(
            [mergedCluster.docId, mergedCluster.docId],
            list(doc.clusterId
                 for doc in mergedCluster.iterateDocumentsInCluster()))
        self.assertEqual(self.cluster2.lastStreamAddedTime,
                         mergedCluster.lastStreamAddedTime)

    def test_mergeCluster_lastStreamAddedTime_lesser_than_original_cluster(
            self):
        mergedCluster = StreamCluster.getClusterObjectToMergeFrom(
            self.cluster3)
        mergedCluster.mergeCluster(self.cluster1)
        self.assertTrue(self.cluster1.lastStreamAddedTime <
                        self.cluster3.lastStreamAddedTime)
        self.assertEqual(self.cluster3.lastStreamAddedTime,
                         mergedCluster.lastStreamAddedTime)
        self.assertNotEqual(self.cluster1.lastStreamAddedTime,
                            mergedCluster.lastStreamAddedTime)

    def test_addDocument(self):
        message1 = Message(3, 'sdf',
                           'A project to cluster high-dimensional streams.',
                           test_time)
        message1.vector = Vector({3: 4})
        stream1 = Stream(3, message1)
        message2 = Message(4, 'sdf',
                           'A project to cluster high-dimensional streams.',
                           test_time)
        message2.vector = Vector({2: 4})
        stream2 = Stream(4, message2)
        self.assertNotEqual(test_time, self.cluster1.lastStreamAddedTime)
        self.cluster1.addDocument(stream1)
        self.assertEqual(test_time, self.cluster1.lastStreamAddedTime)
        # Test if cluster id is set.
        self.assertEqual(self.cluster1.clusterId, stream1.clusterId)
        # Test that cluster mean is updated.
        self.assertEqual({1: 2 / 2., 2: 2., 3: 2.}, self.cluster1)
        # Test that cluster aggregate is updated.
        self.assertEqual({1: 2, 2: 4, 3: 4}, self.cluster1.aggregateVector)
        # Test that document is added to cluster documents.
        self.assertEqual(stream1,
                         self.cluster1.documentsInCluster[stream1.docId])
        self.cluster1.addDocument(stream2)
        self.assertEqual(3, self.cluster1.vectorWeights)
        self.assertEqual({1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}, self.cluster1)
        self.assertEqual({1: 2, 2: 8, 3: 4}, self.cluster1.aggregateVector)

    def test_clustersIteration(self):
        clusters = [self.cluster1, self.cluster2, self.cluster3]
        self.assertEqual([self.cluster1], [
            cluster
            for cluster in StreamCluster.getClustersByAttributeAndThreshold(
                clusters, 'lastStreamAddedTime', test_time,
                StreamCluster.BELOW_THRESHOLD)
        ])
        self.assertEqual([self.cluster1, self.cluster2], [
            cluster
            for cluster in StreamCluster.getClustersByAttributeAndThreshold(
                clusters, 'lastStreamAddedTime', test_time +
                timedelta(seconds=60), StreamCluster.BELOW_THRESHOLD)
        ])
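
The mean and aggregate assertions in test_addDocument encode a running-average update: the cluster keeps a per-dimension aggregate plus a weight (document count) and exposes the mean as aggregate divided by weight. A minimal sketch of that arithmetic with plain dicts, reproducing the numbers the test expects; it illustrates the expected behaviour, not the library's Vector/StreamCluster code:

def add_document(aggregate, weight, vector):
    # accumulate the new vector into the aggregate and recompute the mean
    for dim, value in vector.items():
        aggregate[dim] = aggregate.get(dim, 0) + value
    weight += 1
    mean = dict((dim, value / float(weight)) for dim, value in aggregate.items())
    return aggregate, weight, mean

aggregate, weight = {1: 2, 2: 4}, 1  # cluster1 seeded with stream1's vector
aggregate, weight, mean = add_document(aggregate, weight, {3: 4})
assert mean == {1: 1.0, 2: 2.0, 3: 2.0}
assert aggregate == {1: 2, 2: 4, 3: 4}
aggregate, weight, mean = add_document(aggregate, weight, {2: 4})
assert weight == 3
assert aggregate == {1: 2, 2: 8, 3: 4}
assert mean == {1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}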