Beispiel #1
0
 def setUp(self):
     """Build a one-message stream plus a standalone vector fixture."""
     msg = Message(1, 'sdf',
                   'A project to cluster high-dimensional streams.',
                   datetime.now())
     msg.vector = Vector({1: 2., 2: 3.})
     self.message = msg
     self.s1 = Stream(1, self.message)
     self.v1 = Vector({1: 2., 3: 3.})
Beispiel #2
0
 def test_addDocument(self):
     """addDocument updates timestamp, id, mean, aggregate and members."""
     def makeStream(streamId, dimensions):
         # Build a stream around a single message with the given vector.
         msg = Message(streamId, 'sdf',
                       'A project to cluster high-dimensional streams.',
                       test_time)
         msg.vector = Vector(dimensions)
         return Stream(streamId, msg)
     stream1 = makeStream(3, {3: 4})
     stream2 = makeStream(4, {2: 4})
     self.assertNotEqual(test_time, self.cluster1.lastStreamAddedTime)
     self.cluster1.addDocument(stream1)
     self.assertEqual(test_time, self.cluster1.lastStreamAddedTime)
     # The stream gets stamped with the cluster's id.
     self.assertEqual(self.cluster1.clusterId, stream1.clusterId)
     # The cluster mean vector is recomputed.
     self.assertEqual({1: 2 / 2., 2: 2., 3: 2.}, self.cluster1)
     # The cluster aggregate vector is recomputed.
     self.assertEqual({1: 2, 2: 4, 3: 4}, self.cluster1.aggregateVector)
     # The stream is tracked among the cluster's documents.
     self.assertEqual(stream1,
                      self.cluster1.documentsInCluster[stream1.docId])
     self.cluster1.addDocument(stream2)
     self.assertEqual(3, self.cluster1.vectorWeights)
     self.assertEqual({1: 2 / 3., 2: 8 / 3., 3: 4 / 3.}, self.cluster1)
     self.assertEqual({1: 2, 2: 8, 3: 4}, self.cluster1.aggregateVector)
Beispiel #3
0
 def setUp(self):
     """Reset the global cluster-id counter and build fixtures."""
     Cluster.clusterIdCounter = 0
     self.docx = Document(1, {1: 2, 2: 4})
     self.docy = Document(2, {2: 4})
     self.cluster1, self.cluster2 = Cluster(self.docx), Cluster(self.docy)
     self.doc1 = Document(3, Vector({3: 4}))
     self.doc2 = Document(4, Vector({2: 4}))
 def mapper(self, _, value):
     """Map step: record pairs of streams whose vectors are similar.

     `value` is a pair [(id0, vec0), (id1, vec1)].  When the cosine
     similarity of the two vectors reaches `self.ssa_threshold`, the
     larger id is added to the similar-streams set keyed by the smaller
     id.  Yields nothing; results accumulate in
     `self.streamIdToSimilarStreamsMap`.
     """
     if False: yield  # keep this method a generator for the MR framework
     [(id0, vec0), (id1, vec1)] = value
     vec0, vec1 = Vector(vec0), Vector(vec1)
     if vec0.cosineSimilarity(vec1) >= self.ssa_threshold:
         # Plain if/else instead of the old side-effecting ternary
         # expression; always key on the smaller id.
         if id0 < id1:
             self.streamIdToSimilarStreamsMap[id0].add(id1)
         else:
             self.streamIdToSimilarStreamsMap[id1].add(id0)
Beispiel #5
0
 def setUp(self):
     """One stream seeded with an older message, plus a newer message."""
     olderTime = test_time - timedelta(seconds=60)
     self.m1 = Message(1, 'sdf',
                       'A project to cluster high-dimensional streams.',
                       olderTime)
     self.m1.vector = Vector({1: 1., 2: 3.})
     self.stream = Stream(1, self.m1)
     self.m2 = Message(1, 'sdf',
                       'A project to cluster high-dimensional streams.',
                       test_time)
     self.m2.vector = Vector({2: 3.})
Beispiel #6
0
 def setUp(self):
     """Phrase fixtures plus a dimension map capped at two dimensions."""
     self.phraseVector = {
         'project': 1, 'cluster': 1, 'highdimensional': 1, 'streams': 1}
     self.phraseTextAndDimensionMap = TwoWayMap()
     for text, dimension in [('project', 0), ('cluster', 1)]:
         self.phraseTextAndDimensionMap.set(TwoWayMap.MAP_FORWARD, text,
                                            dimension)
     staleTime = test_time - 3 * stream_settings[
         'max_phrase_inactivity_time_in_seconds']
     self.phraseTextToPhraseObjectMap = {
         'project': Phrase('project', test_time, score=8),
         'cluster': Phrase('cluster', test_time, score=8),
         # 'abcd' is deliberately stale so pruning logic has a victim.
         'abcd': Phrase('abcd', staleTime, score=8),
     }
     self.vector = Vector({0: 1, 1: 1, 2: 1, 3: 1})
     # Remember the configured dimension count so it can be restored later.
     self.initial_max_dimensions = stream_settings['dimensions']
     stream_settings['dimensions'] = 2
 def getClusterObjectToMergeFrom(streamCluster):
     """Clone `streamCluster` into an empty cluster ready to absorb merges."""
     streamCluster.lastMessageTime = streamCluster.lastStreamAddedTime
     merged = StreamCluster(streamCluster, shouldUpdateDocumentId=False)
     # Start from a blank aggregate so merging re-accumulates everything.
     merged.aggregateVector = Vector({})
     merged.vectorWeights = 0.0
     StreamCluster.updateClusterAttributes(merged, streamCluster)
     return merged
Beispiel #8
0
 def _getVectorMappedToDimension(self, vector, phraseTextAndDimensionMap):
     """Project this object's phrases onto their mapped dimension ids.

     Phrases absent from the forward map are silently dropped.
     NOTE(review): the `vector` parameter is unused here; phrases are
     read from `self` — confirm whether callers rely on that.
     """
     forwardMap = phraseTextAndDimensionMap.getMap(TwoWayMap.MAP_FORWARD)
     mapped = Vector()
     for phrase in self:
         if phrase in forwardMap:
             mapped[forwardMap[phrase]] = self[phrase]
     return mapped
Beispiel #9
0
 def test_maxClusterSize(self):
     """maxClusterSize grows as streams join the cluster."""
     self.assertEqual(1, self.crowd.maxClusterSize)
     msg = Message(4, 'sdf',
                   'A project to cluster high-dimensional streams.',
                   test_time)
     msg.vector = Vector({2: 4})
     self.cluster.addDocument(Stream(4, msg))
     self.assertEqual(2, self.crowd.maxClusterSize)
Beispiel #10
0
 def setUp(self):
     """Three streams a minute apart, each wrapped in its own cluster."""
     text = 'A project to cluster high-dimensional streams.'
     self.m1 = Message(1, 'sdf', text, test_time - timedelta(seconds=60))
     self.m1.vector = Vector({1: 2, 2: 4})
     self.m2 = Message(2, 'sdf', text, test_time)
     self.m2.vector = Vector({2: 4})
     self.m3 = Message(3, 'sdf', text, test_time + timedelta(seconds=60))
     self.m3.vector = Vector({2: 4})
     self.stream1 = Stream(1, self.m1)
     self.stream2 = Stream(2, self.m2)
     self.stream3 = Stream(3, self.m3)
     self.cluster1 = StreamCluster(self.stream1)
     self.cluster2 = StreamCluster(self.stream2)
     self.cluster3 = StreamCluster(self.stream3)
Beispiel #11
0
 def createDocumentFromLine(docId, line):
     """Parse '<clusterId> word word ...' into a word-count Document.

     Unseen words are appended to the global wordToDimensionMap so the
     same word always maps to the same dimension across calls.
     """
     tokens = line.split()
     vector = Vector()
     for token in tokens[1:]:
         if token not in wordToDimensionMap:
             wordToDimensionMap[token] = len(wordToDimensionMap)
         dimension = wordToDimensionMap[token]
         vector[dimension] = vector[dimension] + 1 if dimension in vector else 1
     return Document(docId, vector, clusterId=tokens[0])
Beispiel #12
0
 def convertTweetJSONToMessage(tweet, **twitter_stream_settings):
     """Convert a raw tweet dict into a Message with a phrase-count vector."""
     timestamp = getDateTimeObjectFromTweetTimestamp(tweet['created_at'])
     message = Message(tweet['user']['screen_name'], tweet['id'],
                       tweet['text'], timestamp)
     vector = Vector()
     words = getWordsFromRawEnglishMessage(tweet['text'])
     for phrase in getPhrases(words,
                              twitter_stream_settings['min_phrase_length'],
                              twitter_stream_settings['max_phrase_length']):
         vector[phrase] = vector[phrase] + 1 if phrase in vector else 1
     message.vector = vector
     return message
Beispiel #13
0
 def getClusterFromMapFormat(clusterMap):
     """Rebuild a StreamCluster from its serialized map representation."""
     # A throwaway stream exists only to satisfy the constructor; every
     # meaningful attribute is overwritten from clusterMap below.
     placeholderMessage = Message(1, '', '', datetime.now())
     placeholderMessage.vector = Vector({})
     cluster = StreamCluster(Stream(1, placeholderMessage))
     cluster.clusterId = clusterMap['clusterId']
     cluster.lastStreamAddedTime = getDateTimeObjectFromTweetTimestamp(
         clusterMap['lastStreamAddedTime'])
     cluster.mergedClustersList = clusterMap['mergedClustersList']
     cluster.documentsInCluster = clusterMap['streams']
     for dimension, weight in clusterMap['dimensions'].iteritems():
         cluster[dimension] = weight
     return cluster
Beispiel #14
0
    def cluster(self, dataIterator):
        """Incrementally cluster every message produced by `dataIterator`.

        For each in-order message: update phrase statistics, then either
        register a brand-new stream and assign it to a cluster, or decay
        the existing stream's vector with the new message and re-cluster
        it only when it has drifted far enough from its previous state.
        Dimension-update, filtering and analysis callbacks fire along
        the way.
        """
        # Running count of in-order messages, passed to the analysis hook.
        i = 1
        for data in dataIterator:
            message = self.convertDataToMessageMethod(data,
                                                      **self.stream_settings)
            #            message = data
            # Out-of-order messages are dropped entirely.
            if DataStreamMethods.messageInOrder(message.timeStamp):
                UtilityMethods.updatePhraseTextToPhraseObject(
                    message.vector, message.timeStamp,
                    self.phraseTextToPhraseObjectMap, **self.stream_settings)
                if message.streamId not in self.streamIdToStreamObjectMap:
                    # First message for this stream: create the stream and
                    # assign it to a cluster immediately.
                    self.streamIdToStreamObjectMap[message.streamId] = Stream(
                        message.streamId, message)
                    self.getClusterAndUpdateExistingClusters(
                        self.streamIdToStreamObjectMap[message.streamId])
                else:
                    # Snapshot the stream's vector so the drift caused by
                    # this message can be measured afterwards.
                    previousStreamObject = Vector(
                        vectorInitialValues=self.streamIdToStreamObjectMap[
                            message.streamId])
                    self.streamIdToStreamObjectMap[
                        message.streamId].updateForMessage(
                            message, VectorUpdateMethods.exponentialDecay,
                            **self.stream_settings)
                    streamObject = self.streamIdToStreamObjectMap[
                        message.streamId]
                    distance = Vector.euclideanDistance(
                        streamObject, previousStreamObject)
                    # NOTE(review): hard-coded drift threshold; re-cluster
                    # and refresh dimensions/filters only on a big jump.
                    if distance > 10:
                        #                        print i, len(self.clusters), distance
                        self.getClusterAndUpdateExistingClusters(
                            self.streamIdToStreamObjectMap[message.streamId])

                        self.updateDimensionsMethod.call(
                            message.timeStamp,
                            hdStreamClusteringObject=self,
                            currentMessageTime=message.timeStamp)
                        self.clusterFilteringMethod.call(
                            message.timeStamp,
                            hdStreamClusteringObject=self,
                            currentMessageTime=message.timeStamp)

        #                self.clusterAnalysisMethod.call(message.timeStamp, hdStreamClusteringObject=self, currentMessageTime=message.timeStamp)

                    # NOTE(review): the analysis hook runs only for
                    # already-known streams (it sits in the `else` branch)
                    # — confirm new-stream messages are meant to skip it.
                    self.clusterAnalysisMethod.call(
                        time.time(),
                        hdStreamClusteringObject=self,
                        currentMessageTime=message.timeStamp,
                        numberOfMessages=i)

#                print i, len(self.clusters)
                i += 1
Beispiel #15
0
def iterateUserDocuments(fileName):
    """Yield (screen_name, aggregated vector) pairs from a JSON tweet file.

    Phrase strings are remapped to compact string ids so the aggregated
    vectors stay small; vectors of tweets by the same user are summed.
    """
    aggregatedVectors = defaultdict(Vector)
    phraseToId = defaultdict(int)
    for tweet in FileIO.iterateJsonFromFile(fileName):
        phraseVector = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweet, **default_experts_twitter_stream_settings).vector
        idVector = Vector()
        for phrase in phraseVector:
            if phrase not in phraseToId:
                phraseToId[phrase] = str(len(phraseToId))
            idVector[phraseToId[phrase]] = phraseVector[phrase]
        userName = tweet['user']['screen_name'].lower()
        aggregatedVectors[userName] += idVector
    for userName, vector in aggregatedVectors.iteritems():
        yield userName, vector
Beispiel #16
0
def iterateTweetUsersAfterCombiningTweets(fileName, **stream_settings):
    """Yield (screen_name, combined vector) pairs from a gzipped tweet file.

    Same aggregation as iterateUserDocuments, but reads gzip input and
    takes the stream settings from the caller.
    """
    combinedVectors = defaultdict(Vector)
    phraseIds = defaultdict(int)
    for tweet in TweetFiles.iterateTweetsFromGzip(fileName):
        phraseVector = TwitterCrowdsSpecificMethods.convertTweetJSONToMessage(
            tweet, **stream_settings).vector
        idVector = Vector()
        for phrase in phraseVector:
            if phrase not in phraseIds:
                phraseIds[phrase] = str(len(phraseIds))
            idVector[phraseIds[phrase]] = phraseVector[phrase]
        user = tweet['user']['screen_name'].lower()
        combinedVectors[user] += idVector
    for user, vector in combinedVectors.iteritems():
        yield user, vector
Beispiel #17
0
 def test_setSignatureUsingVectors(self):
     """Signatures encode the sign of dot products with the unit vectors."""
     dimensionMap = TwoWayMap()
     dimensionMap.set(TwoWayMap.MAP_FORWARD, 'a', 1)
     dimensionMap.set(TwoWayMap.MAP_FORWARD, 'b', 2)
     docInVector = Document(1, {'a': 1, 'b': 4})
     docNotInVector = Document(1, {'a': 1, 'c': 4})
     # Two unit vectors: (3/5, -4/5) and (-5/13, 12/13).
     randomVectors = [Vector({1: 3 / 5., 2: -4 / 5.}),
                      Vector({1: -5 / 13., 2: 12 / 13.})]
     docInVector.setSignatureUsingVectors(randomVectors, dimensionMap)
     docNotInVector.setSignatureUsingVectors(randomVectors, dimensionMap)
     self.assertEqual(Signature('01'), docInVector.signature)
     self.assertEqual(Signature('10'), docNotInVector.signature)
Beispiel #18
0
    def plot_points(self, fr=-5.0, to=5.0, values=50, name='cubic'):
        """Sample the named equation and return the samples as Vectors.

        Args:
            fr, to: inclusive x-range to sample.
            values: number of evenly spaced sample points.
            name: key into self.EQUATIONS; unknown names now fall back to
                'cubic'.  Previously the fallback was commented out, so
                an unknown name left `f` unbound and the loop raised
                UnboundLocalError.

        Returns:
            list of Vector([x, f(0, x)]) sample points.
        """
        # Fall back to the default cubic function for unknown names.
        if name not in self.EQUATIONS:
            name = 'cubic'
        f = getattr(Function, name)  # look up the plotting function

        points = np.linspace(fr, to, values, endpoint=True)
        return [Vector([x, f(0, x)]) for x in points]
@author: kykamath
'''
import sys, os, unittest, cjson
sys.path.append('../../../')
from library.vector import Vector
from itertools import combinations
from experiments.ssa.ssa_sim_mr import SSASimilarityMR
from experiments.ssa.ssa import StreamSimilarityAggregationMR, ItemsClusterer,\
    SimilarStreamAggregation

test_file = 'ssa_test.dat.gz'
test_ssa_threshold = 0.75

# Fixture: stream id -> term-frequency vector.  Ids 1-4 and 5-8 form two
# groups of mutually similar vectors for the aggregation tests.
vectors = {
    '1': Vector({'1': 4, '2': 8}),
    '2': Vector({'1': 4, '2': 8}),
    '3': Vector({'1': 4, '2': 8}),
    '4': Vector({'2': 8}),
    '5': Vector({'3': 4, '4': 8}),
    '6': Vector({'4': 8}),
    '7': Vector({'3': 4, '4': 8}),
    '8': Vector({'3': 4}),
}
def createTestFile():
    """Write every pair of fixture vectors to `test_file` in MR input format.

    Each line is '<json key>\t<json [(id, vector), (id, vector)]>'.
    """
    with open(test_file, 'w') as f:
        for (id1, vec1), (id2, vec2) in combinations(vectors.iteritems(), 2):
            key = cjson.encode(['x'])
            value = cjson.encode([(id1, vec1), (id2, vec2)])
            f.write('%s\t%s\n' % (key, value))

class ItemsClustererTests(unittest.TestCase):
    def setUp(self): self.clusterer = ItemsClusterer()
    def test_addNewCluster(self):
Beispiel #20
0
 def test_exponentialDecay(self):
     """Existing weights are halved before the new vector is added."""
     VectorUpdateMethods.exponentialDecay(self.s1, self.v1, 0.5, 1)
     expected = Vector({1: 3, 2: 1.5, 3: 3})
     self.assertEqual(expected, self.s1)
Beispiel #21
0
 def test_addWithoutDecay(self):
     """Vectors are summed component-wise with no decay applied."""
     VectorUpdateMethods.addWithoutDecay(self.s1, self.v1)
     expected = Vector({1: 4, 2: 3, 3: 3})
     self.assertEqual(expected, self.s1)
def createDocumentFromLine(docId, line):
    """Build a word-count Document from a whitespace-separated line.

    NOTE(review): the `docId` argument is ignored — the first token of
    `line` becomes the document id; confirm callers expect this.
    """
    words = line.split()
    vector = Vector()
    for word in words[1:]:
        vector[word] = vector[word] + 1 if word in vector else 1
    return Document(words[0], vector)
Beispiel #23
0
 def _getDocumentFromTuple((user, text)):
     vector, words = Vector(), text.split()
     for word in words[1:]:
         if word not in vector: vector[word] = 1
         else: vector[word] += 1
     return Document(user, vector)
Beispiel #24
0
 def test_updateForMessage_exponentialDecay(self):
     """Old weights decay before the new message's vector is merged in."""
     self.stream.updateForMessage(self.m2,
                                  VectorUpdateMethods.exponentialDecay,
                                  **stream_settings)
     expected = Vector({1: 0.5, 2: 4.5})
     self.assertEqual(self.stream, expected)
Beispiel #25
0
 def test_updateForMessage_addWithoutDecay(self):
     """The new message's vector is added with no decay of old weights."""
     self.stream.updateForMessage(self.m2,
                                  VectorUpdateMethods.addWithoutDecay,
                                  **stream_settings)
     expected = Vector({1: 1., 2: 6.})
     self.assertEqual(self.stream, expected)
Beispiel #26
0
 def add_arrow_vector(self, vector, colour='k', from_vec=Vector([0, 0])):
     self.update_size_if_required(vector)
     arrow_buff = self.get_arrow_buffer()
     self.arrows.append([from_vec, vector, colour])