Example #1
0
 def testComputeOverlapsWithDiagonal(self):
   """With selfOverlaps=True the condensed overlaps include diagonal terms."""
   rows = [[1, 1, 0, 1],
           [0, 1, 1, 0],
           [1, 1, 1, 1]]
   sparseData = scipy.sparse.csr_matrix(rows)
   overlaps = HierarchicalClustering._computeOverlaps(sparseData,
                                                      selfOverlaps=True)
   # 3 rows -> 3 pairwise overlaps plus 3 self-overlaps = 6 entries.
   self.assertEqual(overlaps.shape, (6,))
   self.assertEqual(overlaps.tolist(), [3, 1, 3, 2, 2, 4])
Example #2
0
def create2DSVDProjection(args, protos, trainingData, documentCategoryMap,
                          knn):
    """Plot a 2D SVD projection of the vectors stored in the KNN classifier.

    Saves two figures under SAVE_PATH: "scatter.png" (points colored by the
    document's most frequent category and, in a second panel, by cluster) and
    "singular_values.png" (the singular value spectrum of the covariance).

    Args:
      args: experiment arguments; only args.modelName is read here (titles).
      protos: iterable of per-cluster index lists; entries equal to -1 are
        skipped when assigning cluster colors.
      trainingData: unused in this function (NOTE(review): dead parameter?)
      documentCategoryMap: dict mapping a document id to a list of integer
        category ids.
      knn: classifier whose stored vectors are extracted and projected.
    """
    sparseDataMatrix = HierarchicalClustering._extractVectorsFromKNN(knn)
    # SVD of the feature covariance matrix; columns of u are principal
    # directions, s the corresponding singular values.
    covarianceMatrix = numpy.cov(sparseDataMatrix.toarray(), rowvar=0)
    u, s, v = numpy.linalg.svd(covarianceMatrix)
    # Project onto the top two components, scaled by their singular values.
    projectionMatrix = numpy.dot(u[:, :2], numpy.diag(s[:2]))
    projectedData = sparseDataMatrix.dot(projectionMatrix)

    # Tally how often each category id occurs, growing the list on demand.
    categoryCounts = []
    for document in documentCategoryMap:
        for category in documentCategoryMap[document]:
            if category >= len(categoryCounts):
                categoryCounts.extend([0] *
                                      (category - len(categoryCounts) + 1))
            categoryCounts[category] += 1
    categoryCounts = numpy.array(categoryCounts)

    # Color each document by its globally most frequent category.
    # NOTE(review): this assumes iteration order of documentCategoryMap
    # matches the row order of projectedData — confirm against the caller.
    colorSequenceBucket = []
    for docId in documentCategoryMap:
        buckets = documentCategoryMap[docId]
        counts = categoryCounts[buckets]
        maxBucketIndex = numpy.argmax(counts)
        maxBucket = buckets[maxBucketIndex]
        colorSequenceBucket.append(maxBucket)

    plt.figure()
    plt.subplot(121, aspect="equal")
    plt.title("Bucket labels (%s)" % (args.modelName, ))
    plt.xlabel("PC 2")
    plt.ylabel("PC 1")
    plt.scatter(projectedData[:, 1],
                projectedData[:, 0],
                c=colorSequenceBucket)

    # Second panel: same points, colored by cluster membership (clusters are
    # numbered by their position in protos).
    colorSequenceClusters = numpy.zeros(len(colorSequenceBucket))
    clusterId = 0
    for dataIndices in protos:
        colorSequenceClusters[[d for d in dataIndices if d != -1]] = clusterId
        clusterId += 1

    plt.subplot(122, aspect="equal")
    plt.title("Clusters (%s)" % (args.modelName, ))
    plt.xlabel("PC 2")
    plt.ylabel("PC 1")
    plt.scatter(projectedData[:, 1],
                projectedData[:, 0],
                c=colorSequenceClusters)
    plt.savefig(os.path.join(SAVE_PATH, "scatter.png"))

    # Diagnostic plot of the (truncated) singular-value spectrum.
    plt.figure()
    plt.plot(s[:250])
    plt.xlabel("Singular value #")
    plt.ylabel("Singular value")
    plt.savefig(os.path.join(SAVE_PATH, "singular_values.png"))
Example #3
0
  def testGetPrototypes(self):
    """_getPrototypes picks the rows with the largest total overlap."""
    sparseData = scipy.sparse.csr_matrix([[1, 1, 0, 1],
                                          [1, 0, 1, 1],
                                          [0, 1, 1, 0],
                                          [1, 1, 1, 1]])
    overlaps = HierarchicalClustering._computeOverlaps(sparseData)

    # Over the whole set, row 3 dominates.
    chosen = HierarchicalClustering._getPrototypes([0, 1, 2, 3], overlaps)
    self.assertEqual(set(chosen.tolist()), {3})

    # Subsets of three rows, asking for the top two prototypes each time.
    chosen = HierarchicalClustering._getPrototypes([1, 2, 3], overlaps, 2)
    self.assertEqual(set(chosen.tolist()), {3, 1})

    chosen = HierarchicalClustering._getPrototypes([0, 2, 3], overlaps, 2)
    self.assertEqual(set(chosen.tolist()), {3, 0})

    chosen = HierarchicalClustering._getPrototypes([0, 1, 2], overlaps, 2)
    self.assertEqual(set(chosen.tolist()), {0, 1})
Example #4
0
  def testCondensedIndex(self):
    """_condensedIndex maps (i, j) pairs to flat condensed-matrix positions."""
    expected = range(6)

    # Upper-triangular coordinates map straight onto 0..5.
    upperA = [0, 0, 0, 1, 1, 2]
    upperB = [1, 2, 3, 2, 3, 3]
    condensed = HierarchicalClustering._condensedIndex(upperA, upperB, 4)
    self.assertEqual(condensed.tolist(), expected)

    # Transposing some pairs into the lower triangle yields the same mapping.
    lowerA = [0, 2, 3, 1, 3, 2]
    lowerB = [1, 0, 0, 2, 1, 3]
    condensed = HierarchicalClustering._condensedIndex(lowerA, lowerB, 4)
    self.assertEqual(condensed.tolist(), expected)

    # Diagonal elements (i == j) must trigger an AssertionError.
    with self.assertRaises(AssertionError):
      HierarchicalClustering._condensedIndex(
          [0, 2, 0, 1, 3, 2], [1, 2, 3, 2, 1, 3], 4)
Example #5
0
  def testExtractVectorsFromKNN(self):
    """Vectors learned by a KNNClassifier are recoverable as a sparse matrix."""
    vectors = numpy.random.rand(10, 25) < 0.1

    # Teach the classifier every row (single dummy category).
    knn = KNNClassifier()
    for row in vectors:
      knn.learn(row, 0)

    extracted = HierarchicalClustering._extractVectorsFromKNN(knn)

    # Row order is not guaranteed, so compare the sorted row lists.
    self.assertEqual(
      sorted(extracted.todense().tolist()),
      sorted(vectors.tolist())
    )
def create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn):
  """Plot a 2D SVD projection of the vectors held by the KNN classifier.

  Writes "scatter.png" (points colored by dominant category and by cluster)
  and "singular_values.png" (singular value spectrum) into SAVE_PATH.
  """
  dataMatrix = HierarchicalClustering._extractVectorsFromKNN(knn)
  # Principal directions come from the SVD of the feature covariance.
  covariance = numpy.cov(dataMatrix.toarray(), rowvar=0)
  leftVectors, singularValues, _right = numpy.linalg.svd(covariance)
  # Keep the top two components, scaled by their singular values.
  projection = numpy.dot(leftVectors[:, :2], numpy.diag(singularValues[:2]))
  projected = dataMatrix.dot(projection)

  # Tally global occurrences of every category id, growing the list lazily.
  categoryCounts = []
  for categories in documentCategoryMap.values():
    for category in categories:
      while category >= len(categoryCounts):
        categoryCounts.append(0)
      categoryCounts[category] += 1
  categoryCounts = numpy.array(categoryCounts)

  # Color each document by its most frequently occurring category.
  colorSequenceBucket = [
      buckets[numpy.argmax(categoryCounts[buckets])]
      for buckets in documentCategoryMap.values()
  ]

  plt.figure()
  plt.subplot(121, aspect="equal")
  plt.title("Bucket labels (%s)" % (args.modelName,))
  plt.xlabel("PC 2")
  plt.ylabel("PC 1")
  plt.scatter(projected[:, 1], projected[:, 0], c=colorSequenceBucket)

  # Second panel: the same points, colored by cluster id (skip -1 padding).
  colorSequenceClusters = numpy.zeros(len(colorSequenceBucket))
  for clusterId, dataIndices in enumerate(protos):
    validIndices = [d for d in dataIndices if d != -1]
    colorSequenceClusters[validIndices] = clusterId

  plt.subplot(122, aspect="equal")
  plt.title("Clusters (%s)" % (args.modelName,))
  plt.xlabel("PC 2")
  plt.ylabel("PC 1")
  plt.scatter(projected[:, 1], projected[:, 0], c=colorSequenceClusters)
  plt.savefig(os.path.join(SAVE_PATH, "scatter.png"))

  # Diagnostic: how quickly do the singular values fall off?
  plt.figure()
  plt.plot(singularValues[:250])
  plt.xlabel("Singular value #")
  plt.ylabel("Singular value")
  plt.savefig(os.path.join(SAVE_PATH, "singular_values.png"))
def runExperiment(args):
  """Train a model, hierarchically cluster its KNN patterns, and report.

  Prints up to args.numPrototypes prototype documents per cluster (with their
  bucket labels) and saves cluster/projection plots under SAVE_PATH. When
  args.knnTest is set, only runs the KNN consistency test and returns early.
  """
  if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

  (trainingDataDup, labelRefs, documentCategoryMap,
   documentTextMap) = readDataAndReshuffle(args)

  # remove duplicates from training data (record[2] is the document id)
  includedDocIds = set()
  trainingData = []
  for record in trainingDataDup:
    if record[2] not in includedDocIds:
      includedDocIds.add(record[2])
      trainingData.append(record)

  args.networkConfig = getNetworkConfig(args.networkConfigPath)
  model = createModel(numLabels=1, **vars(args))
  model = trainModel(args, model, trainingData, labelRefs)

  numDocs = model.getClassifier()._numPatterns

  print "Model trained with %d documents" % (numDocs,)

  knn = model.getClassifier()
  hc = HierarchicalClustering(knn)

  # Complete-linkage clustering, then one prototype list per cluster.
  hc.cluster("complete")
  protos, clusterSizes = hc.getClusterPrototypes(args.numClusters,
                                                 numDocs)

  # Run test to ensure consistency with KNN
  if args.knnTest:
    knnTest(protos, knn)
    return


  # Summary statistics
  # bucketCounts[i, j] is the number of occurrances of bucket j in cluster i
  bucketCounts = numpy.zeros((args.numClusters, len(labelRefs)))

  for clusterId in xrange(len(clusterSizes)):
    print
    print "Cluster %d with %d documents" % (clusterId, clusterSizes[clusterId])
    print "==============="

    prototypeNum = 0
    for index in protos[clusterId]:
      # -1 entries are skipped; only the first numPrototypes are displayed,
      # but bucket counts accumulate over every valid prototype.
      if index != -1:
        docId = trainingData[index][2]
        prototypeNum += 1
        display = prototypeNum <= args.numPrototypes

        if display:
          print "(%d) %s" % (docId, trainingData[index][0])
          print "Buckets:"

        # The docId keys in documentCategoryMap are strings rather than ints
        if docId in documentCategoryMap:
          for bucketId in documentCategoryMap[docId]:
            bucketCounts[clusterId, bucketId] += 1
            if display:
              print "    ", labelRefs[bucketId]
        elif display:
          print "    <None>"
        if display:
          print "\n\n"

  createBucketClusterPlot(args, bucketCounts)
  create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)
Example #8
0
def runExperiment(args):
    """Train a model, hierarchically cluster its KNN patterns, and report.

    Prints up to args.numPrototypes prototype documents per cluster (with
    their bucket labels) and saves cluster/projection plots under SAVE_PATH.
    When args.knnTest is set, only runs the KNN consistency test and returns
    early.
    """
    if not os.path.exists(SAVE_PATH):
        os.makedirs(SAVE_PATH)

    (trainingDataDup, labelRefs, documentCategoryMap,
     documentTextMap) = readDataAndReshuffle(args)

    # remove duplicates from training data (record[2] is the document id)
    includedDocIds = set()
    trainingData = []
    for record in trainingDataDup:
        if record[2] not in includedDocIds:
            includedDocIds.add(record[2])
            trainingData.append(record)

    args.networkConfig = getNetworkConfig(args.networkConfigPath)
    model = createModel(numLabels=1, **vars(args))
    model = trainModel(args, model, trainingData, labelRefs)

    numDocs = model.getClassifier()._numPatterns

    print "Model trained with %d documents" % (numDocs, )

    knn = model.getClassifier()
    hc = HierarchicalClustering(knn)

    # Complete-linkage clustering, then one prototype list per cluster.
    hc.cluster("complete")
    protos, clusterSizes = hc.getClusterPrototypes(args.numClusters, numDocs)

    # Run test to ensure consistency with KNN
    if args.knnTest:
        knnTest(protos, knn)
        return

    # Summary statistics
    # bucketCounts[i, j] is the number of occurrances of bucket j in cluster i
    bucketCounts = numpy.zeros((args.numClusters, len(labelRefs)))

    for clusterId in xrange(len(clusterSizes)):
        print
        print "Cluster %d with %d documents" % (clusterId,
                                                clusterSizes[clusterId])
        print "==============="

        prototypeNum = 0
        for index in protos[clusterId]:
            # -1 entries are skipped; only the first numPrototypes are shown,
            # but bucket counts accumulate over every valid prototype.
            if index != -1:
                docId = trainingData[index][2]
                prototypeNum += 1
                display = prototypeNum <= args.numPrototypes

                if display:
                    print "(%d) %s" % (docId, trainingData[index][0])
                    print "Buckets:"

                # The docId keys in documentCategoryMap are strings rather than ints
                if docId in documentCategoryMap:
                    for bucketId in documentCategoryMap[docId]:
                        bucketCounts[clusterId, bucketId] += 1
                        if display:
                            print "    ", labelRefs[bucketId]
                elif display:
                    print "    <None>"
                if display:
                    print "\n\n"

    createBucketClusterPlot(args, bucketCounts)
    create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)