def getClusterForDocument(self, document):
    """Return the id of the most similar candidate cluster, or None.

    The document is first passed to
    UtilityMethods.updatePhraseTextAndDimensionsMap and given a signature.
    Candidate cluster ids are the union of what every signature permutation
    returns from getNearestDocuments; they are ranked by the cosine
    similarity between the cluster and the document.

    Returns None when there are no candidates or when the best similarity is
    below self.thresholdForDocumentToBeInACluster.
    """
    UtilityMethods.updatePhraseTextAndDimensionsMap(
        document, self.phraseTextAndDimensionMap, **self.clustering_settings)
    document.setSignatureUsingVectorPermutations(
        self.unitVector, self.vectorPermutations,
        self.phraseTextAndDimensionMap)

    # Merge the candidates proposed by each permutation.
    candidateClusterIds = set()
    for permutation in self.signaturePermutations:
        candidateClusterIds = candidateClusterIds.union(
            permutation.getNearestDocuments(document))
    if not candidateClusterIds:
        return None

    bestClusterId, bestSimilarity = max(
        ((clusterId, self.clusters[clusterId].cosineSimilarity(document))
         for clusterId in candidateClusterIds),
        key=itemgetter(1))
    if bestSimilarity >= self.thresholdForDocumentToBeInACluster:
        return bestClusterId
    return None
Esempio n. 2
0
 def test_updatedPhraseObject_PhraseObjectScoresAreUpdatedCorrectly(self):
     """Check map size and phrase scores after an update 60 seconds past test_time."""
     updateTime = test_time + timedelta(seconds=60)
     UtilityMethods.updatePhraseTextToPhraseObject(
         self.phraseVector, updateTime, self.phraseTextToPhraseObjectMap,
         **stream_settings)
     self.assertEqual(5, len(self.phraseTextToPhraseObjectMap))
     for phraseText, expectedScore in (('project', 5), ('streams', 1)):
         self.assertEqual(
             expectedScore,
             self.phraseTextToPhraseObjectMap[phraseText].score)
Esempio n. 3
0
 def cluster(self, dataIterator):
     """Cluster the streams described by the items of ``dataIterator``.

     Each item is converted to a message with
     self.convertDataToMessageMethod. Messages rejected by
     DataStreamMethods.messageInOrder are skipped. For every accepted
     message the phrase map is updated, the stream for message.streamId is
     created or updated, the dimension-update, cluster-filtering and
     cluster-analysis methods are invoked with the message time stamp, and
     finally the stream is handed to getClusterAndUpdateExistingClusters.
     """
     for data in dataIterator:
         message = self.convertDataToMessageMethod(data,
                                                   **self.stream_settings)
         if not DataStreamMethods.messageInOrder(message.timeStamp):
             continue
         UtilityMethods.updatePhraseTextToPhraseObject(
             message.vector, message.timeStamp,
             self.phraseTextToPhraseObjectMap, **self.stream_settings)
         if message.streamId not in self.streamIdToStreamObjectMap:
             self.streamIdToStreamObjectMap[message.streamId] = Stream(
                 message.streamId, message)
         else:
             self.streamIdToStreamObjectMap[
                 message.streamId].updateForMessage(
                     message, VectorUpdateMethods.exponentialDecay,
                     **self.stream_settings)
         # Fetch the stream before the periodic methods run, as the
         # original code did.
         streamObject = self.streamIdToStreamObjectMap[message.streamId]
         self.updateDimensionsMethod.call(
             message.timeStamp,
             hdStreamClusteringObject=self,
             currentMessageTime=message.timeStamp)
         self.clusterFilteringMethod.call(
             message.timeStamp,
             hdStreamClusteringObject=self,
             currentMessageTime=message.timeStamp)
         self.clusterAnalysisMethod.call(
             message.timeStamp,
             hdStreamClusteringObject=self,
             currentMessageTime=message.timeStamp)
         self.getClusterAndUpdateExistingClusters(streamObject)
    def cluster(self, dataIterator):
        """Cluster streams, re-clustering an existing stream only after a large change.

        Each item of ``dataIterator`` is converted to a message; messages
        rejected by DataStreamMethods.messageInOrder are skipped.

        * A message for an unseen stream id creates a Stream and clusters it.
        * A message for a known stream updates that stream. If the euclidean
          distance between the stream before and after the update exceeds 10,
          the stream is re-clustered and the dimension-update and
          cluster-filtering methods are invoked. The cluster-analysis method
          is invoked for every message of a known stream.

        The count passed as ``numberOfMessages`` starts at 1 and grows by one
        per accepted message.
        """
        messageNumber = 1
        for data in dataIterator:
            message = self.convertDataToMessageMethod(data, **self.stream_settings)
            if not DataStreamMethods.messageInOrder(message.timeStamp):
                continue
            UtilityMethods.updatePhraseTextToPhraseObject(
                message.vector, message.timeStamp,
                self.phraseTextToPhraseObjectMap, **self.stream_settings)
            if message.streamId in self.streamIdToStreamObjectMap:
                streamObject = self.streamIdToStreamObjectMap[message.streamId]
                # Snapshot the stream so the size of the change can be measured.
                previousStreamObject = Vector(vectorInitialValues=streamObject)
                streamObject.updateForMessage(
                    message, VectorUpdateMethods.exponentialDecay,
                    **self.stream_settings)
                # NOTE(review): 10 is a hard-coded threshold; its derivation
                # is not visible in this block.
                if Vector.euclideanDistance(streamObject, previousStreamObject) > 10:
                    self.getClusterAndUpdateExistingClusters(streamObject)
                    self.updateDimensionsMethod.call(
                        message.timeStamp, hdStreamClusteringObject=self,
                        currentMessageTime=message.timeStamp)
                    self.clusterFilteringMethod.call(
                        message.timeStamp, hdStreamClusteringObject=self,
                        currentMessageTime=message.timeStamp)
                # Scheduled on wall-clock time rather than message time.
                self.clusterAnalysisMethod.call(
                    time.time(), hdStreamClusteringObject=self,
                    currentMessageTime=message.timeStamp,
                    numberOfMessages=messageNumber)
            else:
                self.streamIdToStreamObjectMap[message.streamId] = Stream(message.streamId, message)
                self.getClusterAndUpdateExistingClusters(
                    self.streamIdToStreamObjectMap[message.streamId])
            messageNumber += 1
 def test_updateDimensions_when_phrases_with_lower_id_are_removed_from_phraseTextToIdMap(self):
     """With 3 dimensions, updateDimensions must leave exactly the ids 0, 1 and 2 assigned."""
     stream_settings['dimensions'] = 3
     for phrase, score in zip(['new', 'phrases', 'are'], range(100, 103)):
         self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase, test_time, score=score)
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 2)
     self.phraseTextToPhraseObjectMap['cluster'].score = 100
     UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap, self.phraseTextToPhraseObjectMap, test_time, **stream_settings)
     assignedDimensions = sorted(self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD).values())
     # sorted() returns a list; list(range(3)) keeps the comparison
     # list-to-list even where range() is not a list.
     self.assertEqual(list(range(3)), assignedDimensions)
 def test_updateDimensions_when_dimensions_have_to_be_removed(self):
     """After updateDimensions only 'project' and 'new_text' remain as mapped phrases."""
     stream_settings['dimensions'] = 4
     for phraseText, dimension in (('abcdx', 2), ('abcdxy', 3)):
         self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, phraseText, dimension)
     self.phraseTextToPhraseObjectMap['new_text'] = Phrase('new_text', test_time, score=7)
     staleTime = test_time-3*stream_settings['max_phrase_inactivity_time_in_seconds']
     self.phraseTextToPhraseObjectMap['cluster'].latestOccuranceTime = staleTime
     UtilityMethods.updateDimensions(
         self.phraseTextAndDimensionMap, self.phraseTextToPhraseObjectMap,
         test_time, **stream_settings)
     # Only the phrase texts (map keys) are compared, not their dimensions.
     expectedPhrases = set(['project', 'new_text'])
     mappedPhrases = set(self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
     self.assertEqual(expectedPhrases, mappedPhrases)
 def getNearestDocument(self, document):
     """Return the id of the most similar candidate document, or None.

     Candidates are the union of getNearestDocuments over all signature
     permutations, ranked by cosine similarity with ``document``. Returns
     None when there are no candidates or the best similarity is below
     self.nearestNeighborThreshold.
     """
     UtilityMethods.updatePhraseTextAndDimensionsMap(
         document, self.phraseTextAndDimensionMap, **self.settings)
     document.setSignatureUsingVectorPermutations(
         self.unitVector, self.vectorPermutations,
         self.phraseTextAndDimensionMap)

     candidateDocIds = set()
     for permutation in self.signaturePermutations:
         candidateDocIds = candidateDocIds.union(
             permutation.getNearestDocuments(document))

     predictedNeighbor = None
     if candidateDocIds:
         predictedNeighbor = max(
             ((docId, self.documentIdToDocumentMap[docId].cosineSimilarity(document))
              for docId in candidateDocIds),
             key=itemgetter(1))
     # NOTE(review): looks like leftover debug output -- confirm before removing.
     print(predictedNeighbor)
     if predictedNeighbor is None:
         return None
     if predictedNeighbor[1] >= self.nearestNeighborThreshold:
         return predictedNeighbor[0]
     return None
 def update(self, newDocument):
     """Insert ``newDocument``, replacing any stored document with the same docId.

     The document is passed to
     UtilityMethods.updatePhraseTextAndDimensionsMap, stored in
     self.documentIdToDocumentMap and given a signature. In every signature
     permutation the previously stored document (if any) is removed before
     the new one is added.
     """
     UtilityMethods.updatePhraseTextAndDimensionsMap(newDocument, self.phraseTextAndDimensionMap, **self.settings)
     currentDocument = self.documentIdToDocumentMap.get(newDocument.docId, None)
     self.documentIdToDocumentMap[newDocument.docId] = newDocument
     newDocument.setSignatureUsingVectorPermutations(self.unitVector, self.vectorPermutations, self.phraseTextAndDimensionMap)
     for permutation in self.signaturePermutations:
         # Identity test: a custom __ne__/__eq__ on the document must not
         # affect the "was there a previous document" decision.
         if currentDocument is not None:
             permutation.removeDocument(currentDocument)
         permutation.addDocument(newDocument)
Esempio n. 9
0
 def updateDimensions(hdStreamClusteringObject, currentMessageTime):
     """Update the clustering object's dimensions, then reset its clusters in the signature tries.

     Delegates to UtilityMethods.updateDimensions with the object's phrase
     maps and stream settings, followed by
     DataStreamMethods._resetClustersInSignatureTries.
     """
     clusteringObject = hdStreamClusteringObject
     UtilityMethods.updateDimensions(
         clusteringObject.phraseTextAndDimensionMap,
         clusteringObject.phraseTextToPhraseObjectMap,
         currentMessageTime,
         **clusteringObject.stream_settings)
     DataStreamMethods._resetClustersInSignatureTries(clusteringObject,
                                                      currentMessageTime)
 def run(self, dataIterator, estimationMethod, parameterSpecificDataCollectionMethod=None):
     """Feed the messages of ``dataIterator`` to ``estimationMethod``.

     ``estimationMethod`` is wrapped in a FixedIntervalMethod built with
     self.timeUnitInSeconds. Messages rejected by CDA.messageInOrder are
     skipped. For each accepted message the optional
     ``parameterSpecificDataCollectionMethod`` is called first, then the
     phrase map is updated, then the wrapped estimation method is invoked
     with the message time stamp.
     """
     intervalMethod = FixedIntervalMethod(estimationMethod, self.timeUnitInSeconds)
     for data in dataIterator:
         message = self.convertDataToMessageMethod(data, **self.stream_settings)
         if not CDA.messageInOrder(message.timeStamp):
             continue
         # Identity comparison is the reliable test for "no callback given".
         if parameterSpecificDataCollectionMethod is not None:
             parameterSpecificDataCollectionMethod(estimationObject=self, message=message)
         UtilityMethods.updatePhraseTextToPhraseObject(message.vector, message.timeStamp, self.phraseTextToPhraseObjectMap, **self.stream_settings)
         intervalMethod.call(message.timeStamp, estimationObject=self, currentMessageTime=message.timeStamp)
Esempio n. 11
0
 def test_updatePhraseTextAndDimensionsMap_PhraseMapHasLesserDimensions(
         self):
     """With 4 dimensions allowed, the forward map must equal finalPhraseToIdMap."""
     settings['dimensions'] = 4
     UtilityMethods.updatePhraseTextAndDimensionsMap(
         self.phraseVector, self.phraseTextAndDimensionMap, **settings)
     forwardMap = self.phraseTextAndDimensionMap.getMap(
         TwoWayMap.MAP_FORWARD)
     self.assertEqual(self.finalPhraseToIdMap, forwardMap)
 def test_updateDimensions_remove_old_phrases(self):
     """'abcd' survives updateDimensions when its time is test_time and is dropped once its original time is restored."""
     def refreshDimensions():
         UtilityMethods.updateDimensions(
             self.phraseTextAndDimensionMap,
             self.phraseTextToPhraseObjectMap, test_time,
             **stream_settings)

     originalTime = self.phraseTextToPhraseObjectMap['abcd'].latestOccuranceTime
     self.phraseTextToPhraseObjectMap['abcd'].latestOccuranceTime = test_time
     refreshDimensions()
     self.assertTrue('abcd' in self.phraseTextToPhraseObjectMap)

     self.phraseTextToPhraseObjectMap['abcd'].latestOccuranceTime = originalTime
     refreshDimensions()
     self.assertTrue('abcd' not in self.phraseTextToPhraseObjectMap)
 def run(self, dataIterator, estimationMethod, parameterSpecificDataCollectionMethod=None):
     """Drive ``estimationMethod`` with the in-order messages of ``dataIterator``.

     The estimation method is wrapped in a FixedIntervalMethod using
     self.timeUnitInSeconds. For every message accepted by
     CDA.messageInOrder: the optional data-collection callback is invoked,
     the phrase map is updated, and the wrapped method is called with the
     message time stamp.
     """
     fixedIntervalEstimation = FixedIntervalMethod(estimationMethod, self.timeUnitInSeconds)
     collectData = parameterSpecificDataCollectionMethod
     for data in dataIterator:
         message = self.convertDataToMessageMethod(data, **self.stream_settings)
         if CDA.messageInOrder(message.timeStamp):
             # "is not None" instead of "!= None": an identity test cannot
             # be affected by a custom comparison on the callback.
             if collectData is not None:
                 collectData(estimationObject=self, message=message)
             UtilityMethods.updatePhraseTextToPhraseObject(message.vector, message.timeStamp, self.phraseTextToPhraseObjectMap, **self.stream_settings)
             fixedIntervalEstimation.call(message.timeStamp, estimationObject=self, currentMessageTime=message.timeStamp)
 def test_updatedPhraseObject_phrase_does_not_exist_in_phraseToIdMap_but_exists_in_phraseTextToPhraseObjectMap_with_dimensions_full(self):
     """With one dimension and 'cluster' unmapped, the dimension map stays {'project': 0} while scores are still updated."""
     stream_settings['dimensions'] = 1
     self.phraseTextAndDimensionMap.remove(TwoWayMap.MAP_FORWARD, 'cluster')
     updateTime = test_time+timedelta(seconds=60)
     UtilityMethods.updatePhraseTextToPhraseObject(
         self.phraseVector, updateTime, self.phraseTextToPhraseObjectMap,
         **stream_settings)
     self.assertEqual({'project':0}, self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
     self.assertEqual(5, len(self.phraseTextToPhraseObjectMap))
     expectedScores = (('project', 5), ('cluster', 5), ('streams', 1))
     for phraseText, expectedScore in expectedScores:
         self.assertEqual(expectedScore, self.phraseTextToPhraseObjectMap[phraseText].score)
Esempio n. 15
0
 def test_updatePhraseTextAndDimensionsMap_PhraseMapHasMaximumDimensions(
         self):
     """The forward map must equal finalPhraseToIdMap without 'streams' and 'highdimensional'."""
     UtilityMethods.updatePhraseTextAndDimensionsMap(
         self.phraseVector, self.phraseTextAndDimensionMap, **settings)
     # These two phrases are not expected in the resulting map.
     del self.finalPhraseToIdMap['streams']
     del self.finalPhraseToIdMap['highdimensional']
     forwardMap = self.phraseTextAndDimensionMap.getMap(
         TwoWayMap.MAP_FORWARD)
     self.assertEqual(self.finalPhraseToIdMap, forwardMap)
Esempio n. 16
0
 def test_createOrAddNewPhraseObject(self):
     """Adding 'new_phrase' and 'project' yields 4 phrases with scores 1 and 9 respectively."""
     for phraseText in ('new_phrase', 'project'):
         UtilityMethods.createOrAddNewPhraseObject(
             phraseText, self.phraseTextToPhraseObjectMap, test_time,
             **stream_settings)
     phraseObjects = self.phraseTextToPhraseObjectMap
     self.assertEqual(4, len(phraseObjects))
     self.assertEqual(1, phraseObjects['new_phrase'].score)
     self.assertEqual(9, phraseObjects['project'].score)
Esempio n. 17
0
 def test_pruningConditionRandom(self):
     """pruningConditionRandom is true for the phrase built with the older time and false for the one built with test_time."""
     inactivity = stream_settings['max_phrase_inactivity_time_in_seconds']
     stalePhrase = Phrase('dsf', test_time - 3 * inactivity, 1)
     freshPhrase = Phrase('dsf', test_time, 1)
     shouldPruneStale = UtilityMethods.pruningConditionRandom(
         stalePhrase, test_time, **stream_settings)
     self.assertTrue(shouldPruneStale)
     shouldPruneFresh = UtilityMethods.pruningConditionRandom(
         freshPhrase, test_time, **stream_settings)
     self.assertFalse(shouldPruneFresh)
 def update(self, newDocument):
     """Store ``newDocument`` and index it in every signature permutation.

     The document is passed to
     UtilityMethods.updatePhraseTextAndDimensionsMap, saved under its docId
     in self.documentIdToDocumentMap and assigned a signature. If a document
     with the same docId was already stored, it is removed from each
     permutation before the new one is added.
     """
     UtilityMethods.updatePhraseTextAndDimensionsMap(
         newDocument, self.phraseTextAndDimensionMap, **self.settings)
     currentDocument = self.documentIdToDocumentMap.get(
         newDocument.docId, None)
     self.documentIdToDocumentMap[newDocument.docId] = newDocument
     newDocument.setSignatureUsingVectorPermutations(
         self.unitVector, self.vectorPermutations,
         self.phraseTextAndDimensionMap)
     for permutation in self.signaturePermutations:
         # Compare by identity so a custom __ne__/__eq__ on the document
         # cannot change whether the old entry is removed.
         if currentDocument is not None:
             permutation.removeDocument(currentDocument)
         permutation.addDocument(newDocument)
Esempio n. 19
0
 def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions(
         self):
     """Adding 'added' with score 10 must give the forward map {'project': 0, 'added': 1}."""
     self.phraseTextToPhraseObjectMap['added'] = Phrase('added',
                                                        test_time,
                                                        score=10)
     UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                     self.phraseTextToPhraseObjectMap,
                                     test_time, **stream_settings)
     expectedMap = {'project': 0, 'added': 1}
     self.assertEqual(
         expectedMap,
         self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
Esempio n. 20
0
    def cluster(self, dataIterator):
        """Process a data stream, clustering new streams and re-clustering changed ones.

        Items are converted with self.convertDataToMessageMethod and ignored
        when DataStreamMethods.messageInOrder rejects their time stamp. A
        first message for a stream id creates and clusters a Stream. Later
        messages update the stream; it is re-clustered (followed by the
        dimension-update and cluster-filtering methods) only when the
        euclidean distance between its state before and after the update is
        greater than 10. The cluster-analysis method runs for every message
        of an already known stream, receiving the running message count.
        """
        acceptedMessages = 1
        for data in dataIterator:
            message = self.convertDataToMessageMethod(data,
                                                      **self.stream_settings)
            if not DataStreamMethods.messageInOrder(message.timeStamp):
                continue
            UtilityMethods.updatePhraseTextToPhraseObject(
                message.vector, message.timeStamp,
                self.phraseTextToPhraseObjectMap, **self.stream_settings)
            isNewStream = (
                message.streamId not in self.streamIdToStreamObjectMap)
            if isNewStream:
                self.streamIdToStreamObjectMap[message.streamId] = Stream(
                    message.streamId, message)
                self.getClusterAndUpdateExistingClusters(
                    self.streamIdToStreamObjectMap[message.streamId])
            else:
                streamObject = self.streamIdToStreamObjectMap[
                    message.streamId]
                # Copy taken before the update, used to measure the change.
                streamBeforeUpdate = Vector(vectorInitialValues=streamObject)
                streamObject.updateForMessage(
                    message, VectorUpdateMethods.exponentialDecay,
                    **self.stream_settings)
                distance = Vector.euclideanDistance(streamObject,
                                                    streamBeforeUpdate)
                # NOTE(review): the threshold 10 is hard-coded; confirm its
                # origin with the author.
                if distance > 10:
                    self.getClusterAndUpdateExistingClusters(streamObject)
                    self.updateDimensionsMethod.call(
                        message.timeStamp,
                        hdStreamClusteringObject=self,
                        currentMessageTime=message.timeStamp)
                    self.clusterFilteringMethod.call(
                        message.timeStamp,
                        hdStreamClusteringObject=self,
                        currentMessageTime=message.timeStamp)
                # Scheduled on wall-clock time, unlike the two calls above.
                self.clusterAnalysisMethod.call(
                    time.time(),
                    hdStreamClusteringObject=self,
                    currentMessageTime=message.timeStamp,
                    numberOfMessages=acceptedMessages)
            acceptedMessages += 1
    def cluster(self, dataIterator):
        """Cluster the streams described by the items of ``dataIterator``.

        Each item is converted to a message; messages rejected by
        DataStreamMethods.messageInOrder are skipped. For accepted messages
        the phrase map is updated, the stream for message.streamId is created
        or updated, the dimension-update, cluster-filtering and
        cluster-analysis methods are invoked with the message time stamp, and
        the stream is passed to getClusterAndUpdateExistingClusters.
        """
        for data in dataIterator:
            message = self.convertDataToMessageMethod(data, **self.stream_settings)
            if not DataStreamMethods.messageInOrder(message.timeStamp):
                continue
            UtilityMethods.updatePhraseTextToPhraseObject(message.vector, message.timeStamp, self.phraseTextToPhraseObjectMap, **self.stream_settings)
            if message.streamId not in self.streamIdToStreamObjectMap:
                self.streamIdToStreamObjectMap[message.streamId] = Stream(message.streamId, message)
            else:
                self.streamIdToStreamObjectMap[message.streamId].updateForMessage(message, VectorUpdateMethods.exponentialDecay, **self.stream_settings)
            # Fetched before the periodic methods run, as in the original code.
            streamObject = self.streamIdToStreamObjectMap[message.streamId]
            # The three periodic methods share one call signature and run in this order.
            for periodicMethod in (self.updateDimensionsMethod, self.clusterFilteringMethod, self.clusterAnalysisMethod):
                periodicMethod.call(message.timeStamp, hdStreamClusteringObject=self, currentMessageTime=message.timeStamp)
            self.getClusterAndUpdateExistingClusters(streamObject)
Esempio n. 22
0
 def test_updatedPhraseObject_phrase_does_not_exist_in_phraseToIdMap_but_exists_in_phraseTextToPhraseObjectMap_with_dimensions_full(
         self):
     """Scores are updated while the one-slot dimension map remains {'project': 0}."""
     stream_settings['dimensions'] = 1
     self.phraseTextAndDimensionMap.remove(TwoWayMap.MAP_FORWARD, 'cluster')
     laterTime = test_time + timedelta(seconds=60)
     UtilityMethods.updatePhraseTextToPhraseObject(
         self.phraseVector, laterTime, self.phraseTextToPhraseObjectMap,
         **stream_settings)
     forwardMap = self.phraseTextAndDimensionMap.getMap(
         TwoWayMap.MAP_FORWARD)
     self.assertEqual({'project': 0}, forwardMap)
     self.assertEqual(5, len(self.phraseTextToPhraseObjectMap))
     for phraseText, expectedScore in (('project', 5), ('cluster', 5),
                                       ('streams', 1)):
         self.assertEqual(
             expectedScore,
             self.phraseTextToPhraseObjectMap[phraseText].score)
Esempio n. 23
0
 def test_pruneUnnecessaryPhrases(self):
     """pruneUnnecessaryPhrases removes 'dsf' (built with the older time) and keeps 'abc'."""
     inactivity = stream_settings['max_phrase_inactivity_time_in_seconds']
     phraseTextToPhraseObjectMap = {}
     phraseTextToPhraseObjectMap['dsf'] = Phrase(
         'dsf', test_time - 3 * inactivity, 1)
     phraseTextToPhraseObjectMap['abc'] = Phrase('abc', test_time, 1)
     UtilityMethods.pruneUnnecessaryPhrases(
         phraseTextToPhraseObjectMap, test_time,
         UtilityMethods.pruningConditionRandom, **stream_settings)
     self.assertTrue('dsf' not in phraseTextToPhraseObjectMap)
     self.assertTrue('abc' in phraseTextToPhraseObjectMap)
Esempio n. 24
0
 def test_updateDimensions_remove_old_phrases(self):
     """'abcd' is kept while its time equals test_time and removed after its original time is restored."""
     phraseObjects = self.phraseTextToPhraseObjectMap
     updateArguments = (self.phraseTextAndDimensionMap, phraseObjects,
                        test_time)

     originalTime = phraseObjects['abcd'].latestOccuranceTime
     phraseObjects['abcd'].latestOccuranceTime = test_time
     UtilityMethods.updateDimensions(*updateArguments, **stream_settings)
     self.assertTrue('abcd' in self.phraseTextToPhraseObjectMap)

     self.phraseTextToPhraseObjectMap[
         'abcd'].latestOccuranceTime = originalTime
     UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                     self.phraseTextToPhraseObjectMap,
                                     test_time, **stream_settings)
     self.assertTrue('abcd' not in self.phraseTextToPhraseObjectMap)
Esempio n. 25
0
 def test_updateDimensions_when_phrases_with_lower_id_are_removed_from_phraseTextToIdMap(
         self):
     """After updateDimensions with 3 dimensions, the assigned ids must be exactly 0, 1 and 2."""
     stream_settings['dimensions'] = 3
     for phrase, score in zip(['new', 'phrases', 'are'], range(100, 103)):
         self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase,
                                                           test_time,
                                                           score=score)
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'cluster', 2)
     self.phraseTextToPhraseObjectMap['cluster'].score = 100
     UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                     self.phraseTextToPhraseObjectMap,
                                     test_time, **stream_settings)
     # sorted() yields a list, so the expected value is built as a list
     # too; a bare range(3) only compares equal where range() is a list.
     self.assertEqual(
         list(range(3)),
         sorted(
             self.phraseTextAndDimensionMap.getMap(
                 TwoWayMap.MAP_FORWARD).values()))
Esempio n. 26
0
 def test_updateDimensions_when_dimensions_have_to_be_removed(self):
     """Only 'project' and 'new_text' stay mapped once updateDimensions has run."""
     stream_settings['dimensions'] = 4
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdx', 2)
     self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, 'abcdxy', 3)
     self.phraseTextToPhraseObjectMap['new_text'] = Phrase('new_text',
                                                           test_time,
                                                           score=7)
     inactivity = stream_settings['max_phrase_inactivity_time_in_seconds']
     clusterPhrase = self.phraseTextToPhraseObjectMap['cluster']
     clusterPhrase.latestOccuranceTime = test_time - 3 * inactivity
     UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                     self.phraseTextToPhraseObjectMap,
                                     test_time, **stream_settings)
     # The comparison covers the mapped phrase texts only, not their ids.
     remainingPhrases = set(
         self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
     self.assertEqual(set(['project', 'new_text']), remainingPhrases)
Esempio n. 27
0
 def test_updateDimensions_when_phraseTextToIdMap_has_lesser_than_max_dimensions(
         self):
     """With 4 dimensions, 'project', 'phrases', 'are' and 'added' end up mapped and the map holds 4 entries."""
     stream_settings['dimensions'] = 4
     scoredPhrases = (('new', 7), ('phrases', 8), ('are', 9), ('added', 10))
     for phrase, score in scoredPhrases:
         self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase,
                                                           test_time,
                                                           score=score)
     UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap,
                                     self.phraseTextToPhraseObjectMap,
                                     test_time, **stream_settings)
     # Only the phrase texts are compared; dimension ids are not checked.
     expectedPhrases = set(['project', 'phrases', 'are', 'added'])
     mappedPhrases = set(
         self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD))
     self.assertEqual(expectedPhrases, mappedPhrases)
     self.assertEqual(4, len(self.phraseTextAndDimensionMap))
Esempio n. 28
0
 def getClusterForDocument(self, document):
     """Pick the cluster whose cosine similarity with ``document`` is highest.

     Candidate cluster ids come from getNearestDocuments on every signature
     permutation. The winning id is returned only if its similarity reaches
     self.thresholdForDocumentToBeInACluster; otherwise, or when there are
     no candidates, the result is None.
     """
     UtilityMethods.updatePhraseTextAndDimensionsMap(
         document, self.phraseTextAndDimensionMap,
         **self.clustering_settings)
     document.setSignatureUsingVectorPermutations(
         self.unitVector, self.vectorPermutations,
         self.phraseTextAndDimensionMap)

     nearestClusterIds = set()
     for permutation in self.signaturePermutations:
         nearestClusterIds = nearestClusterIds.union(
             permutation.getNearestDocuments(document))

     def scoreCluster(clusterId):
         return (clusterId,
                 self.clusters[clusterId].cosineSimilarity(document))

     predictedCluster = None
     if nearestClusterIds:
         predictedCluster = max(
             (scoreCluster(clusterId) for clusterId in nearestClusterIds),
             key=itemgetter(1))
     if predictedCluster is None:
         return None
     clusterId, similarity = predictedCluster
     if similarity >= self.thresholdForDocumentToBeInACluster:
         return clusterId
     return None
 def getNearestDocument(self, document):
     """Pick the stored document whose cosine similarity with ``document`` is highest.

     Candidate ids come from getNearestDocuments on every signature
     permutation. The best (docId, similarity) pair, or None, is printed.
     The id is returned only if the similarity reaches
     self.nearestNeighborThreshold; otherwise the result is None.
     """
     UtilityMethods.updatePhraseTextAndDimensionsMap(
         document, self.phraseTextAndDimensionMap, **self.settings)
     document.setSignatureUsingVectorPermutations(
         self.unitVector, self.vectorPermutations,
         self.phraseTextAndDimensionMap)

     nearestDocIds = set()
     for permutation in self.signaturePermutations:
         nearestDocIds = nearestDocIds.union(
             permutation.getNearestDocuments(document))

     def scoreDocument(docId):
         return (docId,
                 self.documentIdToDocumentMap[docId].cosineSimilarity(
                     document))

     if nearestDocIds:
         predictedNeighbor = max(
             (scoreDocument(docId) for docId in nearestDocIds),
             key=itemgetter(1))
     else:
         predictedNeighbor = None
     # NOTE(review): presumably debug output left in by the author -- verify
     # that nothing relies on it before removing.
     print(predictedNeighbor)
     if predictedNeighbor is not None:
         docId, similarity = predictedNeighbor
         if similarity >= self.nearestNeighborThreshold:
             return docId
     return None
 def test_pruningConditionRandom(self):
     """The pruning condition holds for the phrase built with the older time and not for the one built with test_time."""
     olderTime = test_time-3*stream_settings['max_phrase_inactivity_time_in_seconds']
     phrasesAndExpectations = (
         (Phrase('dsf', olderTime, 1), self.assertTrue),
         (Phrase('dsf', test_time, 1), self.assertFalse),
     )
     for phrase, check in phrasesAndExpectations:
         check(UtilityMethods.pruningConditionRandom(phrase, test_time, **stream_settings))
Esempio n. 31
0
 def test_updatePhraseTextAndDimensionsMap_PhraseMapHasLesserDimensions(self):
     """When 4 dimensions are allowed, the resulting forward map equals finalPhraseToIdMap."""
     settings['dimensions'] = 4
     dimensionMap = self.phraseTextAndDimensionMap
     UtilityMethods.updatePhraseTextAndDimensionsMap(self.phraseVector, dimensionMap, **settings)
     actualMap = self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)
     self.assertEqual(self.finalPhraseToIdMap, actualMap)
Esempio n. 32
0
 def test_updatePhraseTextAndDimensionsMap_PhraseMapHasMaximumDimensions(self):
     """The forward map must match finalPhraseToIdMap once 'streams' and 'highdimensional' are removed from it."""
     UtilityMethods.updatePhraseTextAndDimensionsMap(self.phraseVector, self.phraseTextAndDimensionMap, **settings)
     for unmappedPhrase in ('streams', 'highdimensional'):
         del self.finalPhraseToIdMap[unmappedPhrase]
     actualMap = self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)
     self.assertEqual(self.finalPhraseToIdMap, actualMap)
 def test_updateDimensions_when_phraseTextToIdMap_has_lesser_than_max_dimensions(self):
     """With 4 dimensions the mapped phrases are 'project', 'phrases', 'are' and 'added', and the map has 4 entries."""
     stream_settings['dimensions'] = 4
     newPhrases = ['new', 'phrases', 'are', 'added']
     for offset, phrase in enumerate(newPhrases):
         # Scores run 7, 8, 9, 10 in list order.
         self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase, test_time, score=7 + offset)
     UtilityMethods.updateDimensions(self.phraseTextAndDimensionMap, self.phraseTextToPhraseObjectMap, test_time, **stream_settings)
     # Only phrase texts are compared here, not the dimension ids.
     expectedPhrases = set(['project', 'phrases', 'are', 'added'])
     self.assertEqual(expectedPhrases, set(self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)))
     self.assertEqual(4, len(self.phraseTextAndDimensionMap))
 def test_updateDimensions_when_phraseTextToIdMap_is_filled_to_max_dimensions_and_entire_map_is_changed(self):
     """Adding 'added' (score 10) and 'are' (score 11) must give the forward map {'added': 1, 'are': 0}."""
     for phrase, score in (('added', 10), ('are', 11)):
         self.phraseTextToPhraseObjectMap[phrase] = Phrase(phrase, test_time, score=score)
     UtilityMethods.updateDimensions(
         self.phraseTextAndDimensionMap, self.phraseTextToPhraseObjectMap,
         test_time, **stream_settings)
     actualMap = self.phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)
     self.assertEqual({'added':1, 'are': 0}, actualMap)
 def test_createOrAddNewPhraseObject(self):
     """After adding 'new_phrase' and 'project' the map has 4 phrases, with scores 1 and 9 for those two."""
     def createOrAdd(phraseText):
         UtilityMethods.createOrAddNewPhraseObject(phraseText, self.phraseTextToPhraseObjectMap, test_time, **stream_settings)

     createOrAdd('new_phrase')
     createOrAdd('project')
     self.assertEqual(4, len(self.phraseTextToPhraseObjectMap))
     for phraseText, expectedScore in (('new_phrase', 1), ('project', 9)):
         self.assertEqual(expectedScore, self.phraseTextToPhraseObjectMap[phraseText].score)
 def test_updatedPhraseObject_PhraseObjectScoresAreUpdatedCorrectly(self):
     """An update 60 seconds after test_time leaves 5 phrases, with 'project' at score 5 and 'streams' at score 1."""
     laterTime = test_time+timedelta(seconds=60)
     UtilityMethods.updatePhraseTextToPhraseObject(
         self.phraseVector, laterTime, self.phraseTextToPhraseObjectMap,
         **stream_settings)
     phraseCount = len(self.phraseTextToPhraseObjectMap)
     self.assertEqual(5, phraseCount)
     projectScore = self.phraseTextToPhraseObjectMap['project'].score
     self.assertEqual(5, projectScore)
     streamsScore = self.phraseTextToPhraseObjectMap['streams'].score
     self.assertEqual(1, streamsScore)
 def test_pruneUnnecessaryPhrases(self):
     """Pruning drops 'dsf', built with the older time, and leaves 'abc' in the map."""
     olderTime = test_time-3*stream_settings['max_phrase_inactivity_time_in_seconds']
     stalePhrase = Phrase('dsf', olderTime, 1)
     freshPhrase = Phrase('abc', test_time, 1)
     phraseTextToPhraseObjectMap = {'dsf': stalePhrase, 'abc': freshPhrase}
     UtilityMethods.pruneUnnecessaryPhrases(
         phraseTextToPhraseObjectMap, test_time,
         UtilityMethods.pruningConditionRandom, **stream_settings)
     self.assertTrue('dsf' not in phraseTextToPhraseObjectMap)
     self.assertTrue('abc' in phraseTextToPhraseObjectMap)
 def updateDimensions(hdStreamClusteringObject, currentMessageTime):
     """Refresh the dimensions of ``hdStreamClusteringObject`` and reset its clusters in the signature tries.

     Calls UtilityMethods.updateDimensions with the object's phrase maps and
     stream settings, then DataStreamMethods._resetClustersInSignatureTries.
     """
     dimensionMap = hdStreamClusteringObject.phraseTextAndDimensionMap
     phraseObjects = hdStreamClusteringObject.phraseTextToPhraseObjectMap
     UtilityMethods.updateDimensions(
         dimensionMap, phraseObjects, currentMessageTime,
         **hdStreamClusteringObject.stream_settings)
     DataStreamMethods._resetClustersInSignatureTries(
         hdStreamClusteringObject, currentMessageTime)