Example #1
0
 def setUp(self):
     """Build shared fixtures: a phrase-count vector, a text<->dimension
     two-way map, a text->Phrase map with one deliberately stale entry,
     and a dense Vector; shrink stream dimensions to 2 for the tests."""
     self.phraseVector = dict.fromkeys(
         ['project', 'cluster', 'highdimensional', 'streams'], 1)
     self.phraseTextAndDimensionMap = TwoWayMap()
     for phraseText, dimensionId in [('project', 0), ('cluster', 1)]:
         self.phraseTextAndDimensionMap.set(
             TwoWayMap.MAP_FORWARD, phraseText, dimensionId)
     # 'abcd' last occurred well beyond the inactivity window, so pruning
     # logic under test should treat it as stale.
     staleTime = test_time - \
         3 * stream_settings['max_phrase_inactivity_time_in_seconds']
     self.phraseTextToPhraseObjectMap = {
         'project': Phrase('project', test_time, score=8),
         'cluster': Phrase('cluster', test_time, score=8),
         'abcd': Phrase('abcd', staleTime, score=8),
     }
     self.vector = Vector(dict.fromkeys(range(4), 1))
     # Remember the configured dimension count so tests can restore it.
     self.initial_max_dimensions = stream_settings['dimensions']
     stream_settings['dimensions'] = 2
Example #2
0
 def test_pruningConditionRandom(self):
     """A phrase idle far past the inactivity window satisfies the pruning
     condition; a freshly seen phrase does not."""
     stalePhrase = Phrase(
         'dsf',
         test_time -
         3 * stream_settings['max_phrase_inactivity_time_in_seconds'],
         1)
     freshPhrase = Phrase('dsf', test_time, 1)
     self.assertTrue(
         UtilityMethods.pruningConditionRandom(stalePhrase, test_time,
                                               **stream_settings))
     self.assertFalse(
         UtilityMethods.pruningConditionRandom(freshPhrase, test_time,
                                               **stream_settings))
class PhraseTests(unittest.TestCase):
    """Tests for Phrase score updating and Phrase.sort ordering."""

    def setUp(self):
        # Two phrases at the same timestamp with different scores.
        self.phrase1 = Phrase('abc', test_time, score=8)
        self.phrase2 = Phrase('xyz', test_time, score=7)

    def test_updateScore(self):
        # After 120 seconds the score decays to 2 and the latest
        # occurrence time advances to the update time.
        laterTime = test_time + timedelta(seconds=120)
        self.phrase1.updateScore(laterTime, 0, **stream_settings)
        self.assertEqual(2, self.phrase1.score)
        self.assertEqual(laterTime, self.phrase1.latestOccuranceTime)

    def test_sort(self):
        # Ascending by default (lower score first); reverse=True flips it.
        self.assertEqual([self.phrase2, self.phrase1],
                         Phrase.sort([self.phrase1, self.phrase2]))
        self.assertEqual([self.phrase1, self.phrase2],
                         Phrase.sort([self.phrase1, self.phrase2],
                                     reverse=True))
Example #4
0
 def test_pruneUnnecessaryPhrases(self):
     """pruneUnnecessaryPhrases removes entries matching the pruning
     condition and keeps active phrases in the map."""
     staleTime = test_time - \
         3 * stream_settings['max_phrase_inactivity_time_in_seconds']
     phraseTextToPhraseObjectMap = {
         'dsf': Phrase('dsf', staleTime, 1),
         'abc': Phrase('abc', test_time, 1),
     }
     UtilityMethods.pruneUnnecessaryPhrases(
         phraseTextToPhraseObjectMap, test_time,
         UtilityMethods.pruningConditionRandom, **stream_settings)
     # Stale phrase pruned, fresh phrase retained.
     self.assertFalse('dsf' in phraseTextToPhraseObjectMap)
     self.assertTrue('abc' in phraseTextToPhraseObjectMap)
 def dimensionsEstimation(estimationObject, currentMessageTime):
     '''
     Estimate the number of dimensions for the stream. For each candidate
     boundary we count the phrases that newly enter the top-`boundary`
     ranking since the previous iteration; the boundary at which this
     churn stabilizes is taken as the stream's dimension count.

     Why do we need this?
     The aim is to get dimensions that don't change too often but at the
     same time are not very large. This experiment gives an approximate
     idea of the number of dimensions: randomly picking a small value
     yields poor dimensions, and picking too big a value is inefficient.
     '''
     # Decay/refresh a phrase's score to the current message time; used so
     # the ranking below reflects up-to-date scores. NOTE: mutates the
     # phrase objects in estimationObject.phraseTextToPhraseObjectMap.
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     # Phrase texts ranked best-first by freshly updated score.
     topDimensionsDuringCurrentIteration = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)]
     oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration
     # Only compare once a previous iteration's ranking exists.
     if estimationObject.topDimensionsDuringPreviousIteration:
         dimensions_estimation = {}
         # For each boundary smaller than the phrase population, count how
         # many texts are new in the current top-`boundary` slice.
         for boundary in estimationObject.boundaries:
             if boundary < len(estimationObject.phraseTextToPhraseObjectMap): dimensions_estimation[str(boundary)] = len(set(newList[:boundary]).difference(oldList[:boundary]))
         print currentMessageTime, len(estimationObject.phraseTextToPhraseObjectMap)
         # One JSON record per iteration, appended to the estimation file.
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': estimationObject.stream_settings.convertToSerializableObject(),
                          ParameterEstimation.dimensionsEstimationId:dimensions_estimation
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsEstimationFile)
     # Snapshot (copy) the current ranking for the next iteration's diff.
     estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]
 def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
     '''
     Observe the new dimensions that get added to the current dimensions
     when the dimensions are refreshed at regular intervals (for example,
     the number of dimensions added after 10m, 20m, ... 5 hours). As the
     interval grows, the number of 'decayed' dimensions increases and the
     current dimensions accumulate unwanted decayed entries. Use this
     information to identify the time interval best suited to refresh
     dimensions.
     Tentative: pick the time interval at which the rate of decay is maximum.
     '''
     # Decay/refresh a phrase's score to the current message time; mutates
     # the phrase objects in estimationObject.phraseTextToPhraseObjectMap.
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     dimensions = estimationObject.stream_settings['dimensions']
     # Top-`dimensions` phrase texts ranked best-first by updated score.
     newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions]
     print currentMessageTime, len(newList)
     # Only meaningful once the ranking fills the full dimension budget.
     if len(newList) >= dimensions:
         # For each configured lookback delta, find the stored snapshot
         # keyed by the (5-minute-rounded) past timestamp, if one exists.
         idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i)) for i in estimationObject.dimensionUpdateTimeDeltas if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap]
         dimensionsUpdateFrequency = {}
         # Churn per lookback interval: how many current top texts were
         # absent from the snapshot taken `td` ago.
         for td, id in idsOfDimensionsListToCompare:
             oldList = estimationObject.dimensionListsMap[id]
             dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList))
         print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)]
         # One JSON record per iteration, appended to the frequency file.
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': pprint.pformat(estimationObject.stream_settings),
                           ParameterEstimation.dimensionsUpdateFrequencyId:dimensionsUpdateFrequency
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile)
         # Store a copy of the current ranking keyed by rounded timestamp.
         estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:]
         # Evict snapshots older than the largest lookback delta
         # (iterate a copy of the keys since we delete while looping).
         for key in estimationObject.dimensionListsMap.keys()[:]:
             if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]: del estimationObject.dimensionListsMap[key]
 # NOTE(review): this is a byte-for-byte duplicate of the
 # dimensionsUpdateFrequencyEstimation defined earlier in this file; at
 # import time this later definition shadows the earlier one. Consider
 # deleting one copy.
 def dimensionsUpdateFrequencyEstimation(estimationObject, currentMessageTime):
     '''
     Observe the new dimensions that get added to the current dimensions
     when the dimensions are refreshed at regular intervals (for example,
     the number of dimensions added after 10m, 20m, ... 5 hours). As the
     interval grows, the number of 'decayed' dimensions increases and the
     current dimensions accumulate unwanted decayed entries. Use this
     information to identify the time interval best suited to refresh
     dimensions.
     Tentative: pick the time interval at which the rate of decay is maximum.
     '''
     # Decay/refresh a phrase's score to the current message time; mutates
     # the phrase objects in estimationObject.phraseTextToPhraseObjectMap.
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     dimensions = estimationObject.stream_settings['dimensions']
     # Top-`dimensions` phrase texts ranked best-first by updated score.
     newList = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)][:dimensions]
     print currentMessageTime, len(newList)
     # Only meaningful once the ranking fills the full dimension budget.
     if len(newList) >= dimensions:
         # For each configured lookback delta, find the stored snapshot
         # keyed by the (5-minute-rounded) past timestamp, if one exists.
         idsOfDimensionsListToCompare = [(i, GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i)) for i in estimationObject.dimensionUpdateTimeDeltas if GeneralMethods.approximateToNearest5Minutes(currentMessageTime - i) in estimationObject.dimensionListsMap]
         dimensionsUpdateFrequency = {}
         # Churn per lookback interval: how many current top texts were
         # absent from the snapshot taken `td` ago.
         for td, id in idsOfDimensionsListToCompare:
             oldList = estimationObject.dimensionListsMap[id]
             dimensionsUpdateFrequency[str(td.seconds)] = len(set(newList).difference(oldList))
         print len(estimationObject.dimensionListsMap), currentMessageTime, len(newList), [(k, dimensionsUpdateFrequency[k]) for k in sorted(dimensionsUpdateFrequency)]
         # One JSON record per iteration, appended to the frequency file.
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': pprint.pformat(estimationObject.stream_settings),
                           ParameterEstimation.dimensionsUpdateFrequencyId:dimensionsUpdateFrequency
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsUpdateFrequencyFile)
         # Store a copy of the current ranking keyed by rounded timestamp.
         estimationObject.dimensionListsMap[GeneralMethods.approximateToNearest5Minutes(currentMessageTime)] = newList[:]
         # Evict snapshots older than the largest lookback delta
         # (iterate a copy of the keys since we delete while looping).
         for key in estimationObject.dimensionListsMap.keys()[:]:
             if currentMessageTime - key > estimationObject.dimensionUpdateTimeDeltas[-1]: del estimationObject.dimensionListsMap[key]
 # NOTE(review): this is a byte-for-byte duplicate of the
 # dimensionsEstimation defined earlier in this file; at import time this
 # later definition shadows the earlier one. Consider deleting one copy.
 def dimensionsEstimation(estimationObject, currentMessageTime):
     '''
     Estimate the number of dimensions for the stream. For each candidate
     boundary we count the phrases that newly enter the top-`boundary`
     ranking since the previous iteration; the boundary at which this
     churn stabilizes is taken as the stream's dimension count.

     Why do we need this?
     The aim is to get dimensions that don't change too often but at the
     same time are not very large. This experiment gives an approximate
     idea of the number of dimensions: randomly picking a small value
     yields poor dimensions, and picking too big a value is inefficient.
     '''
     # Decay/refresh a phrase's score to the current message time; used so
     # the ranking below reflects up-to-date scores. NOTE: mutates the
     # phrase objects in estimationObject.phraseTextToPhraseObjectMap.
     def updatePhraseScore(phraseObject): 
         phraseObject.updateScore(currentMessageTime, 0, **estimationObject.stream_settings)
         return phraseObject
     # Phrase texts ranked best-first by freshly updated score.
     topDimensionsDuringCurrentIteration = [p.text for p in Phrase.sort((updatePhraseScore(p) for p in estimationObject.phraseTextToPhraseObjectMap.itervalues()), reverse=True)]
     oldList, newList = estimationObject.topDimensionsDuringPreviousIteration, topDimensionsDuringCurrentIteration
     # Only compare once a previous iteration's ranking exists.
     if estimationObject.topDimensionsDuringPreviousIteration:
         dimensions_estimation = {}
         # For each boundary smaller than the phrase population, count how
         # many texts are new in the current top-`boundary` slice.
         for boundary in estimationObject.boundaries:
             if boundary < len(estimationObject.phraseTextToPhraseObjectMap): dimensions_estimation[str(boundary)] = len(set(newList[:boundary]).difference(oldList[:boundary]))
         print currentMessageTime, len(estimationObject.phraseTextToPhraseObjectMap)
         # One JSON record per iteration, appended to the estimation file.
         iterationData = {
                          'time_stamp': getStringRepresentationForTweetTimestamp(currentMessageTime),
                          'total_number_of_phrases': len(estimationObject.phraseTextToPhraseObjectMap),
                          'settings': estimationObject.stream_settings.convertToSerializableObject(),
                          ParameterEstimation.dimensionsEstimationId:dimensions_estimation
                          }
         FileIO.writeToFileAsJson(iterationData, estimationObject.dimensionsEstimationFile)
     # Snapshot (copy) the current ranking for the next iteration's diff.
     estimationObject.topDimensionsDuringPreviousIteration = topDimensionsDuringCurrentIteration[:]
Example #9
0
class PhraseTests(unittest.TestCase):
    """Verify Phrase score decay and the ordering contract of Phrase.sort."""

    def setUp(self):
        # Same timestamp, distinct scores, so sorting is decided by score.
        self.phrase1, self.phrase2 = (Phrase('abc', test_time, score=8),
                                      Phrase('xyz', test_time, score=7))

    def test_updateScore(self):
        # 120 seconds later the score has decayed to 2 and the phrase's
        # latest occurrence time tracks the update time.
        updatedTime = test_time + timedelta(seconds=120)
        self.phrase1.updateScore(updatedTime, 0, **stream_settings)
        self.assertEqual(2, self.phrase1.score)
        self.assertEqual(updatedTime, self.phrase1.latestOccuranceTime)

    def test_sort(self):
        # Lower score sorts first by default; reverse=True flips the order.
        ascending = Phrase.sort([self.phrase1, self.phrase2])
        self.assertEqual([self.phrase2, self.phrase1], ascending)
        descending = Phrase.sort([self.phrase1, self.phrase2], reverse=True)
        self.assertEqual([self.phrase1, self.phrase2], descending)
Example #10
0
 def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions(
         self):
     """With the map at capacity, a higher-scoring phrase takes over a
     dimension slot after updateDimensions runs."""
     self.phraseTextToPhraseObjectMap['added'] = Phrase('added',
                                                        test_time,
                                                        score=10)
     UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                     self.phraseTextToPhraseObjectMap,
                                     test_time, **stream_settings)
     expectedForwardMap = {'project': 0, 'added': 1}
     self.assertEqual(
         expectedForwardMap,
         self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
Example #11
0
 def test_updateDimensions_when_phrases_with_lower_id_are_removed_from_phraseTextToIdMap(
         self):
     """After updateDimensions, the assigned dimension ids are repacked
     into the contiguous range 0..dimensions-1."""
     stream_settings['dimensions'] = 3
     for phraseText, phraseScore in zip(['new', 'phrases', 'are'],
                                        range(100, 103)):
         self.phraseTextToPhraseObjectMap[phraseText] = Phrase(
             phraseText, test_time, score=phraseScore)
     # Give 'cluster' a high score and an explicit (higher) dimension id.
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 2)
     self.phraseTextToPhraseObjectMap['cluster'].score = 100
     UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                     self.phraseTextToPhraseObjectMap,
                                     test_time, **stream_settings)
     assignedDimensionIds = self.phraseTextAndDimensionMap.getMap(
         TwoWayMap.MAP_FORWARD).values()
     self.assertEqual(range(3), sorted(assignedDimensionIds))
Example #12
0
 def test_updateDimensions_when_phraseTextToIdMap_has_lesser_than_max_dimensions(
         self):
     """When capacity grows to 4, the top-scoring phrases fill all slots."""
     stream_settings['dimensions'] = 4
     for phraseText, phraseScore in zip(['new', 'phrases', 'are', 'added'],
                                        range(7, 11)):
         self.phraseTextToPhraseObjectMap[phraseText] = Phrase(
             phraseText, test_time, score=phraseScore)
     UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                     self.phraseTextToPhraseObjectMap,
                                     test_time, **stream_settings)
     # Compare key sets only: set() over a mapping yields its keys.
     expectedPhraseTexts = set(['project', 'phrases', 'are', 'added'])
     self.assertEqual(
         expectedPhraseTexts,
         set(self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)))
     self.assertEqual(4, len(self.phraseTextAndDimensionMap))
Example #13
0
 def test_updateDimensions_when_dimensions_have_to_be_removed(self):
     """Phrases whose latest occurrence is far past the inactivity window
     lose their dimension slots; only active texts remain mapped."""
     stream_settings['dimensions'] = 4
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdx', 2)
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdxy', 3)
     self.phraseTextToPhraseObjectMap['new_text'] = Phrase('new_text',
                                                           test_time,
                                                           score=7)
     # Push 'cluster' far past the inactivity window so it gets evicted.
     self.phraseTextToPhraseObjectMap['cluster'].latestOccuranceTime = (
         test_time -
         3 * stream_settings['max_phrase_inactivity_time_in_seconds'])
     UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                     self.phraseTextToPhraseObjectMap,
                                     test_time, **stream_settings)
     # Compare key sets only: set() over a mapping yields its keys.
     self.assertEqual(
         set(['project', 'new_text']),
         set(self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)))
 def test_sort(self):
     """Phrase.sort orders ascending by default; reverse=True descends."""
     ascending = Phrase.sort([self.phrase1, self.phrase2])
     self.assertEqual([self.phrase2, self.phrase1], ascending)
     descending = Phrase.sort([self.phrase1, self.phrase2], reverse=True)
     self.assertEqual([self.phrase1, self.phrase2], descending)
 def setUp(self):
     """Create two phrases sharing a timestamp but with distinct scores."""
     self.phrase1, self.phrase2 = (Phrase('abc', test_time, score=8),
                                   Phrase('xyz', test_time, score=7))
Example #16
0
 def test_sort(self):
     """Lower-scoring phrase sorts first; reverse=True flips the order."""
     sortedAscending = Phrase.sort([self.phrase1, self.phrase2])
     self.assertEqual([self.phrase2, self.phrase1], sortedAscending)
     sortedDescending = Phrase.sort([self.phrase1, self.phrase2],
                                    reverse=True)
     self.assertEqual([self.phrase1, self.phrase2], sortedDescending)
Example #17
0
 def setUp(self):
     """Fixture: two phrases at the same time with scores 8 and 7."""
     for attrName, text, phraseScore in [('phrase1', 'abc', 8),
                                         ('phrase2', 'xyz', 7)]:
         setattr(self, attrName, Phrase(text, test_time, score=phraseScore))