コード例 #1
0
def create2DSVDProjection(args, protos, trainingData, documentCategoryMap,
                          knn):
    """Save 2-D SVD scatter plots and a singular-value plot for KNN vectors.

    The vectors extracted from ``knn`` are multiplied by the first two left
    singular vectors of their covariance matrix, each scaled by its singular
    value. Two files are written under ``SAVE_PATH``:

    - ``scatter.png``: the projected points drawn twice, colored by bucket
      label (left panel) and by cluster id (right panel).
    - ``singular_values.png``: the first (at most) 250 singular values.

    :param args: object providing ``modelName``, shown in the plot titles.
    :param protos: iterable of index collections, one per cluster; the
        cluster id is the position in the iteration, and indices equal to
        -1 are skipped.
    :param trainingData: not used by this function.
    :param documentCategoryMap: mapping from document id to an indexable
        collection of integer category ids (they are used as list indices).
    :param knn: classifier handed to
        ``HierarchicalClustering._extractVectorsFromKNN``.
    """
    vectors = HierarchicalClustering._extractVectorsFromKNN(knn)
    u, s, _ = numpy.linalg.svd(numpy.cov(vectors.toarray(), rowvar=0))
    projectedData = vectors.dot(numpy.dot(u[:, :2], numpy.diag(s[:2])))

    def drawPanel(position, title, colors):
        # Both panels show the same points; only title and coloring differ.
        plt.subplot(position, aspect="equal")
        plt.title(title)
        plt.xlabel("PC 2")
        plt.ylabel("PC 1")
        plt.scatter(projectedData[:, 1], projectedData[:, 0], c=colors)

    # Number of occurrences of every category id; the list is grown on demand
    # so that it can be indexed directly by category id.
    occurrences = []
    for docId in documentCategoryMap:
        for category in documentCategoryMap[docId]:
            while len(occurrences) <= category:
                occurrences.append(0)
            occurrences[category] += 1
    occurrences = numpy.array(occurrences)

    # Each document is colored by whichever of its categories is the most
    # frequent overall.
    # NOTE(review): colors are produced in documentCategoryMap iteration order
    # and paired positionally with the rows of projectedData -- confirm that
    # the two orderings agree.
    bucketColors = []
    for docId in documentCategoryMap:
        buckets = documentCategoryMap[docId]
        bucketColors.append(buckets[numpy.argmax(occurrences[buckets])])

    plt.figure()
    drawPanel(121, "Bucket labels (%s)" % (args.modelName, ), bucketColors)

    # Points that appear in no cluster keep the default color value 0.
    clusterColors = numpy.zeros(len(bucketColors))
    for clusterId, dataIndices in enumerate(protos):
        members = [d for d in dataIndices if d != -1]
        clusterColors[members] = clusterId

    drawPanel(122, "Clusters (%s)" % (args.modelName, ), clusterColors)
    plt.savefig(os.path.join(SAVE_PATH, "scatter.png"))

    plt.figure()
    plt.plot(s[:250])
    plt.xlabel("Singular value #")
    plt.ylabel("Singular value")
    plt.savefig(os.path.join(SAVE_PATH, "singular_values.png"))
コード例 #2
0
  def testExtractVectorsFromKNN(self):
    """Vectors learned by a KNNClassifier are returned by the extraction.

    The comparison ignores row order: both sides are sorted before they are
    checked for equality.
    """
    # Random 10 x 25 boolean matrix (uniform samples thresholded at 0.1).
    expectedRows = numpy.random.rand(10, 25) < 0.1

    # Store every row in the classifier, all under category 0.
    classifier = KNNClassifier()
    for row in expectedRows:
      classifier.learn(row, 0)

    extracted = HierarchicalClustering._extractVectorsFromKNN(classifier)

    actual = sorted(extracted.todense().tolist())
    expected = sorted(expectedRows.tolist())
    self.assertEqual(actual, expected)
コード例 #3
0
def create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn):
  """Project the KNN's stored vectors to 2-D via SVD and save diagnostic plots.

  The covariance matrix of the extracted vectors is decomposed with SVD; the
  vectors are then multiplied by the two leading left singular vectors, each
  scaled by its singular value. Output files, both under ``SAVE_PATH``:

  - ``scatter.png``: two scatter plots of the projected points, colored by
    bucket label (left) and by cluster id (right).
  - ``singular_values.png``: plot of up to the first 250 singular values.

  :param args: object with a ``modelName`` attribute used in the plot titles.
  :param protos: iterable of index collections; the n-th collection defines
      cluster n, and entries equal to -1 are ignored.
  :param trainingData: not used by this function.
  :param documentCategoryMap: mapping from document id to an indexable
      collection of integer category ids (used as list indices).
  :param knn: classifier passed to
      ``HierarchicalClustering._extractVectorsFromKNN``.
  """
  dataMatrix = HierarchicalClustering._extractVectorsFromKNN(knn)
  covariance = numpy.cov(dataMatrix.toarray(), rowvar=0)
  u, s, _ = numpy.linalg.svd(covariance)
  scaledAxes = numpy.dot(u[:,:2], numpy.diag(s[:2]))
  points = dataMatrix.dot(scaledAxes)

  def scatterPanel(position, title, colors):
    # Shared layout of the two sub-plots in scatter.png.
    plt.subplot(position, aspect="equal")
    plt.title(title)
    plt.xlabel("PC 2")
    plt.ylabel("PC 1")
    plt.scatter(points[:,1], points[:,0], c=colors)

  # Tally how often each category id occurs over all documents. The list is
  # padded with zeros whenever a category id beyond its current end shows up.
  tallies = []
  for docId in documentCategoryMap:
    for category in documentCategoryMap[docId]:
      missing = category + 1 - len(tallies)
      if missing > 0:
        tallies.extend([0] * missing)
      tallies[category] += 1
  tallies = numpy.array(tallies)

  def mostCommonBucket(buckets):
    # Among a document's buckets, pick the one with the highest global tally.
    return buckets[numpy.argmax(tallies[buckets])]

  # NOTE(review): one color per document, in documentCategoryMap iteration
  # order; assumes this matches the row order of the projected points.
  bucketColors = [mostCommonBucket(documentCategoryMap[docId])
                  for docId in documentCategoryMap]

  plt.figure()
  scatterPanel(121, "Bucket labels (%s)" % (args.modelName,), bucketColors)

  # Rows not listed in any cluster keep the initial value 0.
  clusterColors = numpy.zeros(len(bucketColors))
  for clusterId, dataIndices in enumerate(protos):
    clusterColors[[d for d in dataIndices if d != -1]] = clusterId

  scatterPanel(122, "Clusters (%s)" % (args.modelName,), clusterColors)
  plt.savefig(os.path.join(SAVE_PATH, "scatter.png"))

  plt.figure()
  plt.plot(s[:250])
  plt.xlabel("Singular value #")
  plt.ylabel("Singular value")
  plt.savefig(os.path.join(SAVE_PATH, "singular_values.png"))