def test_mergeCluster_compare_vector_lastStreamAddedTime_more_than_original_cluster(self):
     mergedCluster = StreamCluster.getClusterObjectToMergeFrom(self.cluster1)
     mergedCluster.mergeCluster(self.cluster2)
     self.assertEqual([self.stream1, self.stream2], list(mergedCluster.iterateDocumentsInCluster()))
     meanVectorForAllDocuments = Vector.getMeanVector([self.stream1, self.stream2])
     self.assertEqual(meanVectorForAllDocuments, mergedCluster)
     self.assertEqual([mergedCluster.docId, mergedCluster.docId], list(doc.clusterId for doc in mergedCluster.iterateDocumentsInCluster()))
     self.assertEqual(self.cluster2.lastStreamAddedTime, mergedCluster.lastStreamAddedTime)
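The test above checks three merge invariants: the merged cluster yields both streams, its vector equals the mean of the two stream vectors, and it adopts lastStreamAddedTime from cluster2, the more recently updated cluster. A minimal, self-contained sketch of the per-dimension averaging Vector.getMeanVector presumably performs, assuming a vector behaves like a dict of dimension weights as the setUp fixtures below suggest (mean_vector is a hypothetical stand-in, not the library API):

from collections import defaultdict

def mean_vector(vectors):
    # Sum each dimension's weight, then divide by the number of vectors;
    # dimensions missing from a vector contribute an implicit zero.
    totals = defaultdict(float)
    for vector in vectors:
        for dimension, weight in vector.items():
            totals[dimension] += weight
    return {dimension: total / len(vectors) for dimension, total in totals.items()}

print(mean_vector([{'#tcot': 2, 'dsf': 4}, {'#tcot': 4}]))  # {'#tcot': 3.0, 'dsf': 2.0}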
 def setUp(self):
     self.tweet = {'user': {'screen_name': 'abc'}, 'id': 10, 'text': 'A project to cluster high-dimensional streams.', 'created_at': 'Tue Mar 01 05:59:59 +0000 2011'}
     m1 = Message(1, '', '', datetime.now())
     m1.vector = Vector({'#tcot': 2, 'dsf': 4})
     self.cluster1 = StreamCluster(Stream(1, m1))
     m2 = Message(2, '', '', datetime.now())
     m2.vector = Vector({'#tcot': 4})
     self.cluster2 = StreamCluster(Stream(2, m2))
     m3 = Message(3, '', '', datetime.now())
     m3.vector = Vector({'#tcot': 2})
     m4 = Message(4, '', '', datetime.now())
     m4.vector = Vector({'#tcot': 2})
     self.doc1 = Stream(1, m3)
     self.doc2 = Stream(2, m4)
     # Mean over all four vectors, captured before the streams are added to
     # the clusters below.
     self.meanVectorForAllDocuments = Vector.getMeanVector([self.cluster1, self.cluster2, self.doc1, self.doc2])
     self.cluster1.addDocument(self.doc1)
     self.cluster2.addDocument(self.doc2)
 def topDimensions(self, numberOfDimensions=10):
     return Vector.getMeanVector(self.clusters.values()).getTopDimensions(numberOfFeatures=numberOfDimensions)
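topDimensions averages every cluster vector and keeps only the heaviest dimensions. A sketch of the top-k selection getTopDimensions presumably performs under the same dict-of-weights assumption (top_dimensions and its tie-breaking are hypothetical, not the library API):

from heapq import nlargest

def top_dimensions(vector, numberOfFeatures=10):
    # Return the numberOfFeatures (dimension, weight) pairs with the
    # largest weights; ties are broken arbitrarily.
    return nlargest(numberOfFeatures, vector.items(), key=lambda item: item[1])

print(top_dimensions({'#tcot': 3.0, 'dsf': 2.0, 'misc': 0.5}, numberOfFeatures=2))  # [('#tcot', 3.0), ('dsf', 2.0)]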
Example #6
from collections import defaultdict
from operator import itemgetter


# Vector, Document, RandomGaussianUnitVector, VectorPermutation,
# SignaturePermutationWithTrie, FileIO, and EvaluationMetrics come from the
# surrounding streaming-LSH codebase.
def offlineLSHClusteringDemo():
    wordToDimensionMap = {}

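    # Each input line is presumably "<clusterLabel> <word> <word> ...": the
    # first token becomes the document's true clusterId, and the remaining
    # tokens are counted into a bag-of-words vector, with wordToDimensionMap
    # assigning each new word the next integer dimension.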
    def createDocumentFromLine(docId, line):
        vector = Vector()
        words = line.split()
        for word in words[1:]:
            if word not in wordToDimensionMap:
                wordToDimensionMap[word] = len(wordToDimensionMap)
            wordDimension = wordToDimensionMap[word]
            if wordDimension not in vector:
                vector[wordDimension] = 1
            else:
                vector[wordDimension] += 1
        return Document(docId, vector, clusterId=words[0])

    dimensions = 53
    signatureLength = 13
    numberOfPermutations = 5

    unitVector = RandomGaussianUnitVector(dimensions=dimensions, mu=0, sigma=1)
    vectorPermutations = VectorPermutation.getPermutations(
        signatureLength, dimensions, unitVector)
    signaturePermutations = [
        SignaturePermutationWithTrie(signatureLength)
        for i in range(numberOfPermutations)
    ]

    permutatedUnitVectors = [
        unitVector.getPermutedVector(r) for r in vectorPermutations
    ]
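    # Sketch of the scheme, inferred from the class names: each document's
    # 13-bit signature presumably comes from projecting it onto permutations
    # of a single random Gaussian unit vector, and each of the 5
    # SignaturePermutationWithTrie instances buckets documents by a permuted
    # signature prefix, so similar vectors collide in some trie and candidate
    # neighbours are retrieved without scanning every cluster.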

    # Build LSH Model.
    # Read training documents.
    trainingDocumentsMap = {}
    for docId, line in enumerate(
            FileIO.iterateLinesFromFile('../data/train_offline.dat')):
        trainingDocumentsMap[docId] = createDocumentFromLine(docId, line)
    # Construct cluster vectors.
    clusterToDocumentsMap = defaultdict(list)
    for document in trainingDocumentsMap.values():
        clusterToDocumentsMap[document.clusterId].append(document)
    clusterMap = {}
    for clusterId, documents in clusterToDocumentsMap.items():
        clusterMap[clusterId] = Document(docId=clusterId,
                                         vector=Vector.getMeanVector(documents),
                                         clusterId=clusterId)

    # Compute cluster signatures and index them in every signature permutation.
    for document in clusterMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)
    for permutation in signaturePermutations:
        for document in clusterMap.values():
            permutation.addDocument(document)

    # Testing the model.
    # Read testing documents.
    testDocumentsMap = {}
    for docId, line in enumerate(
            FileIO.iterateLinesFromFile('../data/test_offline.dat')):
        testDocumentsMap[docId] = createDocumentFromLine(docId, line)
    # Compute signatures for the test documents.
    for document in testDocumentsMap.values():
        document.setSignatureUsingVectors(permutatedUnitVectors)

    predicted, labels = [], []
    for t in testDocumentsMap.values():
        # Union of candidate clusters across all signature permutations.
        possibleNearestClusters = set()
        for permutation in signaturePermutations:
            possibleNearestClusters.update(permutation.getNearestDocuments(t))
        predictedClass = max(
            ((clusterId, clusterMap[clusterId].cosineSimilarity(t))
             for clusterId in possibleNearestClusters),
            key=itemgetter(1))
        predicted.append(predictedClass[0])
        labels.append(t.clusterId)
    return EvaluationMetrics.purity(predicted, labels)
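The returned score comes from EvaluationMetrics.purity. A self-contained sketch of the standard cluster-purity computation it presumably implements, where each predicted cluster is credited with its majority true label and purity is the credited fraction of all test documents (this purity is a hypothetical stand-in, not the library API):

from collections import Counter, defaultdict

def purity(predicted, labels):
    # Group the true labels by the cluster each document was assigned to.
    clusters = defaultdict(list)
    for clusterId, label in zip(predicted, labels):
        clusters[clusterId].append(label)
    # Credit every cluster with its most frequent true label.
    majority = sum(Counter(members).most_common(1)[0][1]
                   for members in clusters.values())
    return majority / len(labels)

print(purity(['a', 'a', 'b', 'b'], ['x', 'x', 'y', 'x']))  # 0.75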