def test_append(self):
     self.crowd.append(self.cluster, test_time+timedelta(days=1))
     self.assertEqual([GeneralMethods.getEpochFromDateTimeObject(test_time), GeneralMethods.getEpochFromDateTimeObject(test_time+timedelta(days=1))], sorted(self.crowd.clusters.keys()))
     self.assertEqual(StreamCluster, type(self.crowd.clusters[GeneralMethods.getEpochFromDateTimeObject(test_time)]))
     self.assertEqual(2, self.crowd.lifespan)
     self.assertEqual(getStringRepresentationForTweetTimestamp(test_time), getStringRepresentationForTweetTimestamp(self.crowd.startTime))
     self.assertEqual(getStringRepresentationForTweetTimestamp(test_time+timedelta(days=1)), getStringRepresentationForTweetTimestamp(self.crowd.endTime))
 def test_getClusterFromMapFormat(self):
     mapReresentation = {'clusterId': 1, 'mergedClustersList': [self.cluster1.clusterId], 'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(test_time), 'streams': [self.doc1.docId], 'dimensions': {'#tcot':2, 'dsf':2}}
     cluster = TwitterCrowdsSpecificMethods.getClusterFromMapFormat(mapReresentation)
     self.assertEqual(1, cluster.clusterId)
     self.assertEqual([self.cluster1.clusterId], cluster.mergedClustersList)
     self.assertEqual([self.doc1.docId], cluster.documentsInCluster)
     self.assertEqual({'#tcot':2, 'dsf':2}, cluster)
     self.assertEqual(getStringRepresentationForTweetTimestamp(test_time), getStringRepresentationForTweetTimestamp(cluster.lastStreamAddedTime))
 def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [cluster.documentsInCluster.keys() for k, cluster in hdStreamClusteringObject.clusters.iteritems() if len(cluster.documentsInCluster.keys())>=experts_twitter_stream_settings['cluster_filter_threshold']]
     iteration_data = evaluation.getEvaluationMetrics(documentClusters, currentTime-previousTime, {'type': experts_twitter_stream_settings['lsh_type'], 'total_clusters': len(hdStreamClusteringObject.clusters), 'current_time': getStringRepresentationForTweetTimestamp(currentMessageTime)})
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyNotUsingVanillaLSH.stats_file)
     del iteration_data['clusters']
     print getStringRepresentationForTweetTimestamp(currentMessageTime), iteration_data
 def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
     '''
     Observe the new dimensions that get added to current dimension if the dimensions 
     are being updated at regular intervals.
     For example, number of dimensions being added after 10m, 20m,... 5 horus. 
     As time increases the number of 'decayed' dimensions increase. The current dimensions
     has a lot of unwanted decayed dimensions. Using this information identify the time 
     interval that is best suited to refresh dimensions. 
     Tentative: We decide to pick the time interval at which the rate of decay is maximum.
     '''
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     dimensions = estimationObject.stream_settings['dimensions']
     newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions]
     print currentMessageTime, len(newList)
     if len(newList) >= dimensions:
         idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i)) for i in estimationObject.dimensionUpdateTimeDeltas if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap]
         dimensionsUpdateFrequency = {}
         for td, id in idsOfDimensionsListToCompare:
             oldList = estimationObject.dimensionListsMap[id]
             dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList))
         print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)]
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': pprint.pformat(estimationObject.stream_settings),
                           ParameterEstimation.dimensionsUpdateFrequencyId:dimensionsUpdateFrequency
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile)
         estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:]
         for key in estimationObject.dimensionListsMap.keys()[:]:
             if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]: del estimationObject.dimensionListsMap[key]
 def dimensionsEstimation(estimationObject, currentMessageTime):
     '''
     This class is used to dimensionsEstimation dimensions in the stream. To dimensionsEstimation it we calculate
     the number of phrases that need to added every iteration for different dimensions.
     The dimension at which the number of phrases added stablizes is the number of dimensions
     for the stream.
     
     Why do we need this?
     The aim is to get dimensions, that dont change too often at the same time are not very huge.
     This experiments gives us an approximate idea of the number of dimensions. Randomly picking 
     a small value will result in dimensions that are not good and picking too big a value will 
     result in inefficiency.  
     '''
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     topDimensionsDuringCurrentIteration = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)]
     oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration
     if estimationObject.topDimensionsDuringPreviousIteration:
         dimensions_estimation = {}
         for boundary in estimationObject.boundaries:
             if boundary < len(estimationObject.phraseTextToPhraseObjectMap): dimensions_estimation[str(boundary)] = len(set(newList[:boundary]).difference(oldList[:boundary]))
         print currentMessageTime, len(estimationObject.phraseTextToPhraseObjectMap)
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': estimationObject.stream_settings.convertToSerializableObject(),
                          ParameterEstimation.dimensionsEstimationId:dimensions_estimation
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsEstimationFile)
     estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]
 def writeClusters(hdStreamClusteringObject, currentMessageTime):
     print '\n\n\nEntering:', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
     iterationData = {'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                      'clusters': map(TwitterCrowdsSpecificMethods.getClusterInMapFormat, [cluster for cluster, _ in sorted(StreamCluster.iterateByAttribute(hdStreamClusteringObject.clusters.values(), 'length'), key=itemgetter(1), reverse=True)]),
                      'settings': Settings.getSerialzedObject(hdStreamClusteringObject.stream_settings)
                      }
     FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['lsh_clusters_folder']+FileIO.getFileByDay(currentMessageTime))
     print 'Leaving: ', currentMessageTime, len(hdStreamClusteringObject.phraseTextAndDimensionMap), len(hdStreamClusteringObject.phraseTextToPhraseObjectMap), len(hdStreamClusteringObject.clusters)
 def writeTweetsForDay(currentDay):
     fileName = houston_data_folder+FileIO.getFileByDay(currentDay)
     for tweet in tweets.find({'ca': {'$gt':currentDay, '$lt': currentDay+timedelta(seconds=86399)}}, fields=['ca', 'tx', 'uid']):
         screenName = GenerateHoustonTweetsData.getScreenName(tweet['uid'])
         if screenName!=None: 
             data = {'id': tweet['_id'], 'text': tweet['tx'], 'created_at':getStringRepresentationForTweetTimestamp(tweet['ca']), 'user':{'screen_name': GenerateHoustonTweetsData.getScreenName(tweet['uid'])}}
             FileIO.writeToFileAsJson(data, fileName) 
     os.system('gzip %s'%fileName)
 def modifiedClusterAnalysisMethod(hdStreamClusteringObject, currentMessageTime):
     global evaluation, previousTime
     currentTime = time.time()
     documentClusters = [
         cluster.documentsInCluster.keys()
         for k, cluster in hdStreamClusteringObject.clusters.iteritems()
         if len(cluster.documentsInCluster.keys()) >= experts_twitter_stream_settings["cluster_filter_threshold"]
     ]
     iteration_data = evaluation.getEvaluationMetrics(
         documentClusters,
         currentTime - previousTime,
         {
             "type": experts_twitter_stream_settings["trie_type"],
             "total_clusters": len(hdStreamClusteringObject.clusters),
             "current_time": getStringRepresentationForTweetTimestamp(currentMessageTime),
         },
     )
     previousTime = time.time()
     FileIO.writeToFileAsJson(iteration_data, JustifyTrie.stats_file)
     del iteration_data["clusters"]
     print getStringRepresentationForTweetTimestamp(currentMessageTime), iteration_data
 def dimensionInActivityTimeEstimation(estimationObject, currentMessageTime):
     phrasesLagDistribution = defaultdict(int)
     for phraseObject in estimationObject.phraseTextToPhraseObjectMap.itervalues():
         lag = DateTimeAirthematic.getDifferenceInTimeUnits(currentMessageTime, phraseObject.latestOccuranceTime, estimationObject.stream_settings['time_unit_in_seconds'].seconds)
         phrasesLagDistribution[str(lag)] += 1
     print currentMessageTime
     iterationData = {
                      'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                      'settings': pprint.pformat(estimationObject.stream_settings),
                      ParameterEstimation.dimensionInActivityTimeId:estimationObject.lagBetweenMessagesDistribution,
                      'phrases_lag_distribution': phrasesLagDistribution
                      }
     FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionInActivityTimeFile)
    def clusterLagDistributionMethod(hdStreamClusteringObject, currentMessageTime):
        lagDistribution = defaultdict(int)
        for cluster in hdStreamClusteringObject.clusters.values():
            lag = DateTimeAirthematic.getDifferenceInTimeUnits(currentMessageTime, cluster.lastStreamAddedTime, hdStreamClusteringObject.stream_settings['time_unit_in_seconds'].seconds)
            lagDistribution[str(lag)] += 1
        print currentMessageTime, len(hdStreamClusteringObject.clusters)
        iterationData = {
                         'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                         'settings': pprint.pformat(hdStreamClusteringObject.stream_settings),
                         ClusteringParametersEstimation.clusterLagDistributionId: lagDistribution,
                         'lag_between_streams_added_to_cluster': hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
                         }
#        print hdStreamClusteringObject.stream_settings['lag_between_streams_added_to_cluster']
        FileIO.writeToFileAsJson(iterationData, hdStreamClusteringObject.stream_settings['%s_file' % ClusteringParametersEstimation.clusterLagDistributionId])
 def setUp(self):
     AnalyzeData.crowdMap, AnalyzeData.clusterIdToCrowdIdMap, AnalyzeData.crowdIdToClusterIdMap = {}, {}, {}
     self.clusterMaps = {
                    test_time: [
                         {'clusterId': 'cluster_4', 'lastStreamAddedTime':getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_1'], 'streams': [], 'dimensions': {}},
                         {'clusterId': 'cluster_5', 'lastStreamAddedTime':getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_2'], 'streams': [], 'dimensions': {}},
                         {'clusterId': 'cluster_6', 'lastStreamAddedTime':getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_3'], 'streams': [], 'dimensions': {}},
                         ],
                    test_time+timedelta(seconds=30*60): [
                         {'clusterId': 'cluster_7', 'lastStreamAddedTime':getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_4'], 'streams': [], 'dimensions': {}},
                         {'clusterId': 'cluster_8', 'lastStreamAddedTime':getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_5','cluster_6'], 'streams': [], 'dimensions': {}},
                         ],
                     test_time+2*timedelta(seconds=30*60): [
                         {'clusterId': 'cluster_9', 'lastStreamAddedTime':getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_7'], 'streams': [], 'dimensions': {}},
                         {'clusterId': 'cluster_10', 'lastStreamAddedTime':getStringRepresentationForTweetTimestamp(test_time), 'mergedClustersList': ['cluster_8'], 'streams': [], 'dimensions': {}},
                         ]
                    }
     AnalyzeData.constructCrowdDataStructures(self.dataIterator)
 def test_getClusterInMapFormat(self):
     mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
     mergedCluster.mergedClustersList = [self.cluster1.clusterId]
     mergedCluster.lastStreamAddedTime = test_time
     mapReresentation = {'clusterId': mergedCluster.clusterId, 'lastStreamAddedTime':getStringRepresentationForTweetTimestamp(mergedCluster.lastStreamAddedTime), 'mergedClustersList': [self.cluster1.clusterId], 'streams': [self.doc1.docId], 'dimensions': {'#tcot':2, 'dsf':2}}
     self.assertEqual(mapReresentation, TwitterCrowdsSpecificMethods.getClusterInMapFormat(mergedCluster))
 def getClusterInMapFormat(cluster, numberOfMaxDimensionsToRepresent=20): 
     return {'clusterId': cluster.clusterId, 'mergedClustersList': cluster.mergedClustersList, 'lastStreamAddedTime': getStringRepresentationForTweetTimestamp(cluster.lastStreamAddedTime),
            'streams': [stream.docId for stream in cluster.iterateDocumentsInCluster()],
            'dimensions': cluster.getTopDimensions(numberOfFeatures=numberOfMaxDimensionsToRepresent)}