def create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn):
    """Save 2-D scatter plots of the KNN's stored vectors plus a singular-value plot.

    The vectors extracted from ``knn`` are projected onto the first two left
    singular vectors of their covariance matrix, each scaled by its singular
    value. Two side-by-side scatter plots of the projected points are written
    to ``scatter.png`` under ``SAVE_PATH``: one colored by bucket label, one
    colored by cluster. The first 250 singular values are plotted to
    ``singular_values.png`` in the same directory.

    Args:
        args: object providing ``modelName``, used in the plot titles.
        protos: iterable of index collections; the position of each collection
            is used as its cluster id, and indices equal to -1 are skipped.
        trainingData: not used by this function.
        documentCategoryMap: mapping from document id to an indexable
            collection of integer category ids.
        knn: classifier passed to
            ``HierarchicalClustering._extractVectorsFromKNN``.
    """
    vectors = HierarchicalClustering._extractVectorsFromKNN(knn)
    covariance = numpy.cov(vectors.toarray(), rowvar=0)
    leftVectors, singularValues, _ = numpy.linalg.svd(covariance)
    projection = numpy.dot(leftVectors[:, :2], numpy.diag(singularValues[:2]))
    points = vectors.dot(projection)
    firstAxis = points[:, 0]
    secondAxis = points[:, 1]

    # Tally how many times each category id occurs across all documents,
    # growing the tally list on demand so it can be indexed by category id.
    tallies = []
    for docKey in documentCategoryMap:
        for categoryId in documentCategoryMap[docKey]:
            if categoryId >= len(tallies):
                tallies.extend([0] * (categoryId - len(tallies) + 1))
            tallies[categoryId] += 1
    tallies = numpy.array(tallies)

    # Color each document by whichever of its categories is most frequent
    # overall (argmax picks the first one on ties).
    # NOTE(review): assumes documentCategoryMap iterates in the same order as
    # the rows of the extracted vectors -- confirm against the caller.
    bucketColors = []
    for docKey in documentCategoryMap:
        docBuckets = documentCategoryMap[docKey]
        dominantPosition = numpy.argmax(tallies[docBuckets])
        bucketColors.append(docBuckets[dominantPosition])

    def drawPanel(position, heading, colors):
        # One scatter panel; the second component goes on the x axis.
        plt.subplot(position, aspect="equal")
        plt.title(heading)
        plt.xlabel("PC 2")
        plt.ylabel("PC 1")
        plt.scatter(secondAxis, firstAxis, c=colors)

    plt.figure()
    drawPanel(121, "Bucket labels (%s)" % (args.modelName, ), bucketColors)

    # Points that appear in no entry of protos keep the initial value 0.
    clusterColors = numpy.zeros(len(bucketColors))
    for clusterIndex, memberIndices in enumerate(protos):
        members = [idx for idx in memberIndices if idx != -1]
        clusterColors[members] = clusterIndex

    drawPanel(122, "Clusters (%s)" % (args.modelName, ), clusterColors)
    plt.savefig(os.path.join(SAVE_PATH, "scatter.png"))

    plt.figure()
    plt.plot(singularValues[:250])
    plt.xlabel("Singular value #")
    plt.ylabel("Singular value")
    plt.savefig(os.path.join(SAVE_PATH, "singular_values.png"))
def testExtractVectorsFromKNN(self):
    """Vectors learned by a KNNClassifier are returned by _extractVectorsFromKNN.

    Rows are compared after sorting, so the check does not depend on the
    order in which the extracted matrix lists them.
    """
    # Random boolean patterns: an entry is True where the uniform draw is
    # below 0.1.
    patterns = numpy.random.rand(10, 25) < 0.1

    # Teach every pattern to the classifier under the same category (0).
    classifier = KNNClassifier()
    for pattern in patterns:
        classifier.learn(pattern, 0)

    # Pull the stored vectors back out as a sparse matrix.
    extracted = HierarchicalClustering._extractVectorsFromKNN(classifier)

    actualRows = sorted(extracted.todense().tolist())
    expectedRows = sorted(patterns.tolist())
    self.assertEqual(actualRows, expectedRows)
def create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn):
    """Write scatter plots of a 2-D SVD projection and a singular-value curve.

    Steps:
      1. Extract the stored vectors from ``knn`` and take the SVD of their
         covariance matrix.
      2. Project the vectors onto the first two left singular vectors, each
         scaled by the matching singular value.
      3. Save ``scatter.png`` (left panel colored by bucket label, right panel
         colored by cluster id) and ``singular_values.png`` (first 250
         singular values) under ``SAVE_PATH``.

    Args:
        args: object with a ``modelName`` attribute shown in the titles.
        protos: iterable whose i-th entry lists the data indices of cluster
            i; entries equal to -1 are ignored.
        trainingData: accepted but not used here.
        documentCategoryMap: maps each document id to an indexable collection
            of integer category ids.
        knn: classifier handed to
            ``HierarchicalClustering._extractVectorsFromKNN``.
    """
    dataMatrix = HierarchicalClustering._extractVectorsFromKNN(knn)
    u, s, _ = numpy.linalg.svd(numpy.cov(dataMatrix.toarray(), rowvar=0))
    projected = dataMatrix.dot(numpy.dot(u[:, :2], numpy.diag(s[:2])))

    # Occurrence count per category id; the list is padded with zeros
    # whenever a category id beyond its current end shows up.
    occurrences = []
    for doc in documentCategoryMap:
        for cat in documentCategoryMap[doc]:
            if cat >= len(occurrences):
                occurrences.extend([0] * (cat - len(occurrences) + 1))
            occurrences[cat] += 1
    occurrences = numpy.array(occurrences)

    # Per document, keep the category with the highest overall count.
    # NOTE(review): presumably documentCategoryMap is ordered like the rows of
    # the extracted vectors -- verify against the caller.
    labelColors = []
    for doc in documentCategoryMap:
        candidates = documentCategoryMap[doc]
        labelColors.append(candidates[numpy.argmax(occurrences[candidates])])

    # Cluster id per data point; indices never listed in protos stay at 0.
    groupColors = numpy.zeros(len(labelColors))
    for groupId, indices in enumerate(protos):
        groupColors[[i for i in indices if i != -1]] = groupId

    # (subplot position, title template, point colors) for each panel.
    panels = (
        (121, "Bucket labels (%s)", labelColors),
        (122, "Clusters (%s)", groupColors),
    )
    plt.figure()
    for position, titleTemplate, colors in panels:
        plt.subplot(position, aspect="equal")
        plt.title(titleTemplate % (args.modelName,))
        plt.xlabel("PC 2")
        plt.ylabel("PC 1")
        plt.scatter(projected[:, 1], projected[:, 0], c=colors)
    plt.savefig(os.path.join(SAVE_PATH, "scatter.png"))

    plt.figure()
    plt.plot(s[:250])
    plt.xlabel("Singular value #")
    plt.ylabel("Singular value")
    plt.savefig(os.path.join(SAVE_PATH, "singular_values.png"))