Code example #1
 def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [
         cluster.documentsInCluster.keys()
         for k, cluster in hdStreamClusteringObject.clusters.iteritems()
         if len(cluster.documentsInCluster.keys()) >=
             experts_twitter_stream_settings['cluster_filter_threshold']
     ]
     iteration_data = evaluation.getEvaluationMetrics(
         documentClusters,
         currentTime - previousTime,
         {
             'type': experts_twitter_stream_settings['lsh_type'],
             'total_clusters': len(hdStreamClusteringObject.clusters),
             'current_time': getStringRepresentationForTweetTimestamp(currentMessageTime)
         })
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyNotUsingVanillaLSH.stats_file)
     del iteration_data['clusters']
     print getStringRepresentationForTweetTimestamp(currentMessageTime), iteration_data
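All of these examples share the helper getStringRepresentationForTweetTimestamp, which is never defined on this page. A minimal sketch of what it might look like, assuming it renders a datetime in the style of Twitter's created_at field (an assumption, not the library's actual code):

 from datetime import datetime

 # Hypothetical stand-in; the real library implementation is not shown on this page.
 def getStringRepresentationForTweetTimestamp(dt):
     # Assumed to format timestamps the way Twitter's created_at field does.
     return dt.strftime('%a %b %d %H:%M:%S +0000 %Y')

 print getStringRepresentationForTweetTimestamp(datetime(2011, 3, 19, 0, 5))
 # Sat Mar 19 00:05:00 +0000 2011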
Code example #2
 def test_append(self):
     self.crowd.append(self.cluster, test_time + timedelta(days=1))
     self.assertEqual([GeneralMethods.getEpochFromDateTimeObject(test_time),
                       GeneralMethods.getEpochFromDateTimeObject(test_time + timedelta(days=1))],
                      sorted(self.crowd.clusters.keys()))
     self.assertEqual(StreamCluster,
                      type(self.crowd.clusters[GeneralMethods.getEpochFromDateTimeObject(test_time)]))
     self.assertEqual(2, self.crowd.lifespan)
     self.assertEqual(getStringRepresentationForTweetTimestamp(test_time),
                      getStringRepresentationForTweetTimestamp(self.crowd.startTime))
     self.assertEqual(getStringRepresentationForTweetTimestamp(test_time + timedelta(days=1)),
                      getStringRepresentationForTweetTimestamp(self.crowd.endTime))
 def test_getClusterFromMapFormat(self):
     mapRepresentation = {'clusterId': 1,
                          'mergedClustersList': [self.cluster1.clusterId],
                          'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time),
                          'streams': [self.doc1.docId],
                          'dimensions': {'#tcot': 2, 'dsf': 2}}
     cluster = TwitterCrowdsSpecificMethods.getClusterFromMapFormat(mapRepresentation)
     self.assertEqual(1, cluster.clusterId)
     self.assertEqual([self.cluster1.clusterId], cluster.mergedClustersList)
     self.assertEqual([self.doc1.docId], cluster.documentsInCluster)
     self.assertEqual({'#tcot': 2, 'dsf': 2}, cluster)
     self.assertEqual(getStringRepresentationForTweetTimestamp(test_time),
                      getStringRepresentationForTweetTimestamp(cluster.lastStreamAddedTime))
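test_append keys self.crowd.clusters by epoch timestamps. A plausible equivalent of GeneralMethods.getEpochFromDateTimeObject, assuming naive datetimes are treated as UTC (hypothetical, for illustration only):

 import calendar
 from datetime import datetime

 # Hypothetical equivalent; assumes naive datetimes represent UTC.
 def getEpochFromDateTimeObject(dt):
     return calendar.timegm(dt.timetuple())

 print getEpochFromDateTimeObject(datetime(2011, 3, 19))  # 1300492800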
Code example #5
 def clusterLagDistributionMethod(hdStreamClusteringObject, currentMessageTime):
     lagDistribution = defaultdict(int)
     for cluster in hdStreamClusteringObject.clusters.values():
         lag = DateTimeAirthematic.getDifferenceInTimeUnits(
             currentMessageTime, cluster.lastStreamAddedTime,
             hdStreamClusteringObject.stream_settings['time_unit_in_seconds'].seconds)
         lagDistribution[str(lag)] += 1
     print currentMessageTime, len(hdStreamClusteringObject.clusters)
     iterationData = {
         'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
         'settings': pprint.pformat(hdStreamClusteringObject.stream_settings),
         ClusteringParametersEstimation.clusterLagDistributionId: lagDistribution,
         'lag_between_streams_added_to_cluster':
             hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
     }
     # print hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
     FileIO.writeToFileAsJson(
         iterationData,
         hdStreamClusteringObject.stream_settings[
             '%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])
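The lag computed above counts how many whole time units have passed since a cluster last received a stream. DateTimeAirthematic.getDifferenceInTimeUnits is not shown on this page; a sketch of the obvious implementation, assuming truncation to whole units:

 from datetime import datetime

 # Hypothetical equivalent of DateTimeAirthematic.getDifferenceInTimeUnits.
 def getDifferenceInTimeUnits(t1, t2, unitInSeconds):
     return int((t1 - t2).total_seconds() / unitInSeconds)

 # A cluster last updated 30 minutes ago lags 6 units of 5 minutes each.
 print getDifferenceInTimeUnits(datetime(2011, 3, 19, 1, 0),
                                datetime(2011, 3, 19, 0, 30), 5 * 60)  # 6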
Code example #6
 def dimensionsEstimation(estimationObject, currentMessageTime):
     '''
     This method is used to estimate the number of dimensions in the stream. To estimate
     it we calculate the number of phrases that need to be added every iteration for
     different dimensions. The dimension at which the number of phrases added stabilizes
     is the number of dimensions for the stream.

     Why do we need this?
     The aim is to get dimensions that don't change too often and at the same time are
     not very large. This experiment gives us an approximate idea of the number of
     dimensions. Picking too small a value will result in dimensions that are not good,
     and picking too big a value will result in inefficiency.
     '''
     def updatePhraseScore(phraseObject):
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     topDimensionsDuringCurrentIteration = [
         p.text for p in Phrase.sort(
             (updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()),
             reverse=True)
     ]
     oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration
     if estimationObject.topDimensionsDuringPreviousIteration:
         dimensions_estimation = {}
         for boundary in estimationObject.boundaries:
             if boundary < len(estimationObject.phraseTextToPhraseObjectMap):
                 dimensions_estimation[str(boundary)] = len(set(newList[:boundary]).difference(oldList[:boundary]))
         print currentMessageTime, len(estimationObject.phraseTextToPhraseObjectMap)
         iterationData = {
             'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
             'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
             'settings': estimationObject.stream_settings.convertToSerializableObject(),
             ParameterEstimation.dimensionsEstimationId: dimensions_estimation
         }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsEstimationFile)
     estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]
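The per-boundary metric above is churn in the top-k dimensions between iterations: how many of the current top-k phrases were absent from the previous top-k. A toy illustration with invented values:

 oldList = ['a', 'b', 'c', 'd', 'e']
 newList = ['a', 'c', 'f', 'g', 'b']
 for boundary in [2, 4]:
     print boundary, len(set(newList[:boundary]).difference(oldList[:boundary]))
 # 2 1  -- only 'c' is new among the top 2
 # 4 2  -- 'f' and 'g' are new among the top 4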
Code example #7
 def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
     '''
     Observe the new dimensions that get added to the current dimensions if the
     dimensions are updated at regular intervals. For example, the number of
     dimensions added after 10m, 20m, ... 5 hours. As time increases, the number
     of 'decayed' dimensions increases, and the current dimensions contain a lot
     of unwanted decayed dimensions. Using this information we identify the time
     interval that is best suited to refresh dimensions.
     Tentative: we decide to pick the time interval at which the rate of decay is maximum.
     '''
     def updatePhraseScore(phraseObject):
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     dimensions = estimationObject.stream_settings['dimensions']
     newList = [p.text for p in Phrase.sort(
         (updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()),
         reverse=True)][:dimensions]
     print currentMessageTime, len(newList)
     if len(newList) >= dimensions:
         idsOfDimensionsListToCompare = [
             (i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i))
             for i in estimationObject.dimensionUpdateTimeDeltas
             if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap
         ]
         dimensionsUpdateFrequency = {}
         for td, id in idsOfDimensionsListToCompare:
             oldList = estimationObject.dimensionListsMap[id]
             dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList))
         print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), \
             [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)]
         iterationData = {
             'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
             'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
             'settings': pprint.pformat(estimationObject.stream_settings),
             ParameterEstimation.dimensionsUpdateFrequencyId: dimensionsUpdateFrequency
         }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile)
         estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:]
         for key in estimationObject.dimensionListsMap.keys()[:]:
             if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]:
                 del estimationObject.dimensionListsMap[key]
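Bucketing timestamps with GeneralMethods.approximateToNearest5Minutes lets dimension lists computed at slightly different moments share a key in dimensionListsMap. A plausible version, assuming it snaps a datetime down to the most recent 5-minute boundary (hypothetical):

 from datetime import datetime

 # Hypothetical equivalent of GeneralMethods.approximateToNearest5Minutes.
 def approximateToNearest5Minutes(dt):
     return dt.replace(minute=dt.minute - dt.minute % 5, second=0, microsecond=0)

 print approximateToNearest5Minutes(datetime(2011, 3, 19, 0, 17, 42))
 # 2011-03-19 00:15:00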
Code example #9
 def writeTweetsForDay(currentDay):
     fileName = houston_data_folder + FileIO.getFileByDay(currentDay)
     for tweet in tweets.find({'ca': {'$gt': currentDay, '$lt': currentDay + timedelta(seconds=86399)}},
                              fields=['ca', 'tx', 'uid']):
         screenName = GenerateHoustonTweetsData.getScreenName(tweet['uid'])
         if screenName is not None:
             data = {'id': tweet['_id'],
                     'text': tweet['tx'],
                     'created_at': getStringRepresentationForTweetTimestamp(tweet['ca']),
                     'user': {'screen_name': screenName}}
             FileIO.writeToFileAsJson(data, fileName)
     os.system('gzip %s' % fileName)
 def writeClusters(hdStreamClusteringObject, currentMessageTime):
     print '\n\n\nEntering:', currentMessageTime, \
         len(hdStreamClusteringObject.phraseTextAndDimensionMap), \
         len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), \
         len(hdStreamClusteringObject.clusters)
     iterationData = {
         'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
         'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat,
                         [cluster for cluster, _ in sorted(
                             StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'),
                             key=itemgetter(1), reverse=True)]),
         'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings)
     }
     FileIO.writeToFileAsJson(iterationData,
                              hdStreamClusteringObject.stream_settings['lsh_clusters_folder'] + FileIO.getFileByDay(currentMessageTime))
     print 'Leaving: ', currentMessageTime, \
         len(hdStreamClusteringObject.phraseTextAndDimensionMap), \
         len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), \
         len(hdStreamClusteringObject.clusters)
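Every example on this page funnels per-iteration records through FileIO.writeToFileAsJson. Judging from the usage (repeated appends of one record per iteration to a stats file), a minimal sketch of such a helper might look as follows; this is an assumption about the library, not its actual code:

 import json
 import os

 # Hypothetical stand-in for FileIO.writeToFileAsJson: append one JSON
 # object per line, creating parent directories on first use.
 def writeToFileAsJson(data, fileName):
     directory = os.path.dirname(fileName)
     if directory and not os.path.exists(directory):
         os.makedirs(directory)
     with open(fileName, 'a') as f:
         f.write(json.dumps(data) + '\n')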
Code example #13
 def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [
         cluster.documentsInCluster.keys()
         for k, cluster in hdStreamClusteringObject.clusters.iteritems()
         if len(cluster.documentsInCluster.keys()) >= experts_twitter_stream_settings["cluster_filter_threshold"]
     ]
     iteration_data = evaluation.getEvaluationMetrics(
         documentClusters,
         currentTime - previousTime,
         {
             "type": experts_twitter_stream_settings["trie_type"],
             "total_clusters": len(hdStreamClusteringObject.clusters),
             "current_time": getStringRepresentationForTweetTimestamp(currentMessageTime),
         },
     )
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyTrie.stats_file)
     del iteration_data["clusters"]
     print getStringRepresentationForTweetTimestamp(currentMessageTime), iteration_data
Code example #14
 def dimensionInActivityTimeEstimation(estimationObject, currentMessageTime):
     phrasesLagDistribution = defaultdict(int)
     for phraseObject in estimationObject.phraseTextToPhraseObjectMap.itervalues():
         lag = DateTimeAirthematic.getDifferenceInTimeUnits(
             currentMessageTime, phraseObject.latestOccuranceTime,
             estimationObject.stream_settings['time_unit_in_seconds'].seconds)
         phrasesLagDistribution[str(lag)] += 1
     print currentMessageTime
     iterationData = {
         'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
         'settings': pprint.pformat(estimationObject.stream_settings),
         ParameterEstimation.dimensionInActivityTimeId: estimationObject.lagBetweenMessagesDistribution,
         'phrases_lag_distribution': phrasesLagDistribution
     }
     FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionInActivityTimeFile)
Code example #17
 def getClusterInMapFormat(cluster, numberOfMaxDimensionsToRepresent=20):
     return {
         'clusterId': cluster.clusterId,
         'mergedClustersList': cluster.mergedClustersList,
         'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(cluster.lastStreamAddedTime),
         'streams': [stream.docId for stream in cluster.iterateDocumentsInCluster()],
         'dimensions': cluster.getTopDimensions(numberOfFeatures=numberOfMaxDimensionsToRepresent)
     }
Code example #18
 def setUp(self):
     AnalyzeData.crowdMap, AnalyzeData.clusterIdToCrowdIdMap, AnalyzeData.crowdIdToClusterIdMap = {}, {}, {}
     self.clusterMaps = {
         test_time: [
             {'clusterId': 'cluster_4', 'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_1'], 'streams': [], 'dimensions': {}},
             {'clusterId': 'cluster_5', 'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_2'], 'streams': [], 'dimensions': {}},
             {'clusterId': 'cluster_6', 'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_3'], 'streams': [], 'dimensions': {}},
         ],
         test_time + timedelta(seconds=30 * 60): [
             {'clusterId': 'cluster_7', 'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_4'], 'streams': [], 'dimensions': {}},
             {'clusterId': 'cluster_8', 'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_5', 'cluster_6'], 'streams': [], 'dimensions': {}},
         ],
         test_time + 2 * timedelta(seconds=30 * 60): [
             {'clusterId': 'cluster_9', 'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_7'], 'streams': [], 'dimensions': {}},
             {'clusterId': 'cluster_10', 'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_8'], 'streams': [], 'dimensions': {}},
         ]
     }
     AnalyzeData.constructCrowdDataStructures(self.dataIterator)
Code example #20
 def test_getClusterInMapFormat(self):
     mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
     mergedCluster.mergedClustersList = [self.cluster1.clusterId]
     mergedCluster.lastStreamAddedTime = test_time
     mapRepresentation = {'clusterId': mergedCluster.clusterId,
                          'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(mergedCluster.lastStreamAddedTime),
                          'mergedClustersList': [self.cluster1.clusterId],
                          'streams': [self.doc1.docId],
                          'dimensions': {'#tcot': 2, 'dsf': 2}}
     self.assertEqual(mapRepresentation, TwitterCrowdsSpecificMethods.getClusterInMapFormat(mergedCluster))