def runExperiment(args): if not os.path.exists(SAVE_PATH): os.makedirs(SAVE_PATH) (trainingDataDup, labelRefs, documentCategoryMap, documentTextMap) = readDataAndReshuffle(args) # remove duplicates from training data includedDocIds = set() trainingData = [] for record in trainingDataDup: if record[2] not in includedDocIds: includedDocIds.add(record[2]) trainingData.append(record) args.networkConfig = getNetworkConfig(args.networkConfigPath) model = createModel(numLabels=1, **vars(args)) model = trainModel(args, model, trainingData, labelRefs) numDocs = model.getClassifier()._numPatterns print "Model trained with %d documents" % (numDocs,) knn = model.getClassifier() hc = HierarchicalClustering(knn) hc.cluster("complete") protos, clusterSizes = hc.getClusterPrototypes(args.numClusters, numDocs) # Run test to ensure consistency with KNN if args.knnTest: knnTest(protos, knn) return # Summary statistics # bucketCounts[i, j] is the number of occurrances of bucket j in cluster i bucketCounts = numpy.zeros((args.numClusters, len(labelRefs))) for clusterId in xrange(len(clusterSizes)): print print "Cluster %d with %d documents" % (clusterId, clusterSizes[clusterId]) print "===============" prototypeNum = 0 for index in protos[clusterId]: if index != -1: docId = trainingData[index][2] prototypeNum += 1 display = prototypeNum <= args.numPrototypes if display: print "(%d) %s" % (docId, trainingData[index][0]) print "Buckets:" # The docId keys in documentCategoryMap are strings rather than ints if docId in documentCategoryMap: for bucketId in documentCategoryMap[docId]: bucketCounts[clusterId, bucketId] += 1 if display: print " ", labelRefs[bucketId] elif display: print " <None>" if display: print "\n\n" createBucketClusterPlot(args, bucketCounts) create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)
def runExperiment(args): if not os.path.exists(SAVE_PATH): os.makedirs(SAVE_PATH) (trainingDataDup, labelRefs, documentCategoryMap, documentTextMap) = readDataAndReshuffle(args) # remove duplicates from training data includedDocIds = set() trainingData = [] for record in trainingDataDup: if record[2] not in includedDocIds: includedDocIds.add(record[2]) trainingData.append(record) args.networkConfig = getNetworkConfig(args.networkConfigPath) model = createModel(numLabels=1, **vars(args)) model = trainModel(args, model, trainingData, labelRefs) numDocs = model.getClassifier()._numPatterns print "Model trained with %d documents" % (numDocs, ) knn = model.getClassifier() hc = HierarchicalClustering(knn) hc.cluster("complete") protos, clusterSizes = hc.getClusterPrototypes(args.numClusters, numDocs) # Run test to ensure consistency with KNN if args.knnTest: knnTest(protos, knn) return # Summary statistics # bucketCounts[i, j] is the number of occurrances of bucket j in cluster i bucketCounts = numpy.zeros((args.numClusters, len(labelRefs))) for clusterId in xrange(len(clusterSizes)): print print "Cluster %d with %d documents" % (clusterId, clusterSizes[clusterId]) print "===============" prototypeNum = 0 for index in protos[clusterId]: if index != -1: docId = trainingData[index][2] prototypeNum += 1 display = prototypeNum <= args.numPrototypes if display: print "(%d) %s" % (docId, trainingData[index][0]) print "Buckets:" # The docId keys in documentCategoryMap are strings rather than ints if docId in documentCategoryMap: for bucketId in documentCategoryMap[docId]: bucketCounts[clusterId, bucketId] += 1 if display: print " ", labelRefs[bucketId] elif display: print " <None>" if display: print "\n\n" createBucketClusterPlot(args, bucketCounts) create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)