def testMappingsWithImbuDocumentModel(self):
    """
    Train a CioDocumentFingerprint model on self.testDocuments, query it, and
    check that the formatted results carry document-level ID mappings.
    """
    modelName = "CioDocumentFingerprint"
    model = createModel(
        modelName,
        numLabels=1,
        classifierMetric="pctOverlapOfInput",
        filterText=True,
        verbosity=0,
        fingerprintType=EncoderTypes.document,
        cacheRoot=None,
    )

    # Each document is trained as a single sample whose ID is its position in
    # self.testDocuments.
    for docId, document in enumerate(self.testDocuments):
        model.trainDocument(document, [0], docId)

    query = ("The key to artificial intelligence has always been the "
             "representation.")
    _, sortedIds, sortedDistances = model.inferDocument(
        query, returnDetailedResults=True, sortResults=True)

    # One prototype ID is expected back for every trained document.
    self.assertEqual(
        len(self.testDocuments),
        len(sortedIds),
        "Document-level models should have one prototype ID per document.")

    # Every formatted result must report a wordId of 0.
    for result in self._formatResults(modelName, sortedDistances, sortedIds):
        self.assertEqual(
            0,
            result["wordId"],
            "wordId is insignificant in document-level models, and should be 0.")
def instantiateModel(args):
    """
    Fill in the experiment-specific arguments and build the model.

    @param args Options namespace. `networkConfigPath` and `modelName` are read;
        `networkConfig` and `k` are set on it before every attribute is
        forwarded to createModel() as a keyword argument.
    @return Whatever createModel() returns.
    """
    networkConfig = getNetworkConfig(args.networkConfigPath)
    args.networkConfig = networkConfig

    # NOTE(review): kValues is not defined in this function; presumably a
    # module-level mapping of model name -> k. Falls back to k=1 when the model
    # name has no entry.
    neighbors = kValues.get(args.modelName, 1)
    args.k = neighbors

    modelKwargs = vars(args)
    return createModel(**modelKwargs)
def testMappingsWithImbuDocumentModel(self):
    """
    Check ID mappings produced by a document-level (CioDocumentFingerprint)
    model trained on self.testDocuments.
    """
    modelName = "CioDocumentFingerprint"
    modelParams = dict(
        numLabels=1,
        classifierMetric="pctOverlapOfInput",
        filterText=True,
        verbosity=0,
        fingerprintType=EncoderTypes.document,
        cacheRoot=None,
    )
    model = createModel(modelName, **modelParams)

    # Train with the document's index as its sample ID.
    for sampleId, sampleText in enumerate(self.testDocuments):
        model.trainDocument(sampleText, [0], sampleId)

    # Run a single detailed, sorted inference over the trained documents.
    query = ("The key to artificial intelligence has always been the "
             "representation.")
    _, sortedIds, sortedDistances = model.inferDocument(
        query, returnDetailedResults=True, sortResults=True)

    expectedCount = len(self.testDocuments)
    returnedCount = len(sortedIds)
    self.assertEqual(
        expectedCount,
        returnedCount,
        "Document-level models should have one prototype ID per document.")

    formatted = self._formatResults(modelName, sortedDistances, sortedIds)
    for entry in formatted:
        self.assertEqual(
            0,
            entry["wordId"],
            "wordId is insignificant in document-level models, and should be 0.")
def _executeModelLifecycle(self, modelName, modelDir):
    """
    Run a model through create -> train -> save -> reload.

    @param modelName Model type name passed to createModel().
    @param modelDir Directory the model is saved to and then loaded from.
    @return The instance produced by ClassificationModel.load(modelDir).
    """
    trained = trainModel(createModel(modelName, **self.modelParams),
                         self.dataSet)
    trained.save(modelDir)

    # Drop the in-memory instance so the returned model can only come from
    # the saved copy.
    del trained

    reloaded = ClassificationModel.load(modelDir)
    return reloaded
def testMappingsWithImbuWordModel(self):
    """
    Train a Keywords model token by token on self.testDocuments and check
    that inference results map back to the right samples and words.
    """
    modelName = "Keywords"
    model = createModel(
        modelName,
        numLabels=1,
        k=42,
        classifierMetric="pctOverlapOfInput",
        filterText=True,
        verbosity=0,
    )

    # Word IDs combine the document index (scaled by tokenIndexingFactor) with
    # the token's mapped index; the sequence is reset on each document's last
    # token.
    for seqId, text in enumerate(self.testDocuments):
        tokenList, mapping = model.tokenize(text)
        finalPosition = len(tokenList) - 1
        for position, (token, tokenIndex) in enumerate(zip(tokenList, mapping)):
            wordId = seqId * self.tokenIndexingFactor + tokenIndex
            isLast = position == finalPosition
            model.trainToken(token, [0], wordId, reset=int(isLast))

    # First query: two matches are expected, both from the same sample.
    query = ("The key to artificial intelligence has always been the "
             "representation.")
    _, sortedIds, sortedDistances = model.inferDocument(
        query, returnDetailedResults=True, sortResults=True)

    # The returned prototype IDs must be exactly the expected ones (any order).
    self.assertItemsEqual(
        self.filteredProtoIds,
        sortedIds,
        "List of IDs returned from inference does not match the expected list of "
        "prototype IDs.")

    # Two zero distances followed by a non-match.
    closest = sortedDistances[:3].tolist()
    self.assertSequenceEqual(
        [0.0, 0.0, 1.0],
        closest,
        "Expected two exact-matching prototypes.")

    # Both top results come from the same sample but point at different words.
    results = self._formatResults(modelName, sortedDistances, sortedIds)
    firstHit, secondHit = results[0], results[1]
    self.assertEqual(firstHit["sampleId"], secondHit["sampleId"])
    self.assertEqual(firstHit["text"], secondHit["text"])
    self.assertNotEqual(firstHit["wordId"], secondHit["wordId"])

    # The word at the reported index must appear in the query.
    matchingWord = firstHit["text"].split(" ")[firstHit["wordId"]]
    self.assertIn(matchingWord, query, "Matching word is indexed incorrectly.")

    # Second query: five matches are expected, spread over two samples.
    query = "sequence"
    _, sortedIds, sortedDistances = model.inferDocument(
        query, returnDetailedResults=True, sortResults=True)

    closest = sortedDistances[:6].tolist()
    self.assertSequenceEqual(
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        closest,
        "Expected five exact-matching prototypes.")

    # Each exact match belongs to sample 2 or 3 and contains the query term.
    results = self._formatResults(modelName, sortedDistances, sortedIds)
    for hit in results[:5]:
        self.assertIn(hit["sampleId"], (2, 3))
        matchingWord = hit["text"].split(" ")[hit["wordId"]]
        self.assertIn(query, matchingWord, "Matching word is indexed incorrectly.")
def instantiateModel(args):
    """
    Build the model used by this experiment.

    @param args Options namespace. `networkConfigPath` and `modelName` are read;
        `networkConfig`, `numLabels` and `k` are set on it before every
        attribute is forwarded to createModel() as a keyword argument.
    @return Whatever createModel() returns.
    """
    # Values of k known to work well for this problem, per model type; any
    # other model type falls back to k=1.
    kValues = {"keywords": 21, "docfp": 3}

    # Arguments this experiment requires.
    networkConfig = getNetworkConfig(args.networkConfigPath)
    args.networkConfig = networkConfig
    args.numLabels = 2
    args.k = kValues.get(args.modelName, 1)

    modelKwargs = vars(args)
    return createModel(**modelKwargs)
def _modelFactory(self, modelName, savePath, **kwargs):
    """
    Imbu model factory. Returns a concrete instance of a classification model
    given a model type name and kwargs.

    @param modelName (str) Must be one of 'CioWordFingerprint',
        'CioDocumentFingerprint', 'HTMNetwork', 'Keywords'.
    @param savePath Passed to the model as its `modelDir`.
    @param kwargs Extra keyword arguments forwarded to createModel(). For
        'HTMNetwork' models, kwargs["networkConfigName"] is read to load the
        network config.
    @return The model built by createModel(), with its verbosity set to 0.
    @raises ValueError if the mapped model name is not one of the handled types.
    """
    kwargs.update(modelDir=savePath, **self._defaultModelFactoryKwargs())

    modelName = self._mapModelName(modelName)

    if getattr(ClassificationModelTypes, modelName) in self.requiresCIOKwargs:
        # Model type requires Cortical.io credentials
        kwargs.update(retina=self.retina, apiKey=self.apiKey)
        # Specify encoder params
        kwargs.update(cacheRoot=self.cacheRoot, retinaScaling=1.0)

    if modelName == "CioWordFingerprint":
        kwargs.update(fingerprintType=EncoderTypes.word)
    elif modelName == "CioDocumentFingerprint":
        kwargs.update(fingerprintType=EncoderTypes.document)
    elif modelName == "HTMNetwork":
        try:
            kwargs.update(
                networkConfig=_loadNetworkConfig(kwargs["networkConfigName"]))
        except Exception:
            # Single-argument parenthesised print emits the same text under
            # both Python 2 and Python 3.
            print("Could not add params to HTMNetwork model config.")
            # Bare raise re-raises the active exception with its original
            # traceback; `raise e` would restart the traceback here in Python 2.
            raise
    elif modelName == "Keywords":
        # k should be > the number of data samples because the Keywords model
        # looks for exact matching tokens, so we want to consider all data
        # samples in the search of k nearest neighbors.
        kwargs.update(k=10 * len(self.dataDict))
    else:
        raise ValueError(
            "{} is not an acceptable Imbu model.".format(modelName))

    model = createModel(modelName, **kwargs)
    model.verbosity = 0

    return model
def _modelFactory(self, modelName, savePath, **kwargs):
    """
    Imbu model factory. Returns a concrete instance of a classification model
    given a model type name and kwargs.

    @param modelName (str) Must be one of 'CioWordFingerprint',
        'CioDocumentFingerprint', 'HTMNetwork', 'Keywords'.
    @param savePath Passed to the model as its `modelDir`.
    @param kwargs Extra keyword arguments forwarded to createModel(). For
        'HTMNetwork' models, kwargs["networkConfigName"] is read to load the
        network config.
    @return The model built by createModel(), with its verbosity set to 0.
    @raises ValueError if the mapped model name is not one of the handled types.
    """
    kwargs.update(modelDir=savePath, **self._defaultModelFactoryKwargs())

    modelName = self._mapModelName(modelName)

    if getattr(ClassificationModelTypes, modelName) in self.requiresCIOKwargs:
        # Model type requires Cortical.io credentials
        kwargs.update(retina=self.retina, apiKey=self.apiKey)
        # Specify encoder params
        kwargs.update(cacheRoot=self.cacheRoot, retinaScaling=1.0)

    if modelName == "CioWordFingerprint":
        kwargs.update(fingerprintType=EncoderTypes.word)
    elif modelName == "CioDocumentFingerprint":
        kwargs.update(fingerprintType=EncoderTypes.document)
    elif modelName == "HTMNetwork":
        try:
            networkConfig = _loadNetworkConfig(kwargs["networkConfigName"])
            kwargs.update(networkConfig=networkConfig)
        except Exception:
            # Single-argument parenthesised print emits the same text under
            # both Python 2 and Python 3.
            print("Could not add params to HTMNetwork model config.")
            # Re-raise with a bare `raise` so the original traceback is kept;
            # `raise e` would discard it under Python 2.
            raise
    elif modelName == "Keywords":
        # k should be > the number of data samples because the Keywords model
        # looks for exact matching tokens, so we want to consider all data
        # samples in the search of k nearest neighbors.
        kwargs.update(k=10 * len(self.dataDict))
    else:
        raise ValueError("{} is not an acceptable Imbu model.".format(modelName))

    model = createModel(modelName, **kwargs)
    model.verbosity = 0

    return model
def runExperiment(args): if not os.path.exists(SAVE_PATH): os.makedirs(SAVE_PATH) (trainingDataDup, labelRefs, documentCategoryMap, documentTextMap) = readDataAndReshuffle(args) # remove duplicates from training data includedDocIds = set() trainingData = [] for record in trainingDataDup: if record[2] not in includedDocIds: includedDocIds.add(record[2]) trainingData.append(record) args.networkConfig = getNetworkConfig(args.networkConfigPath) model = createModel(numLabels=1, **vars(args)) model = trainModel(args, model, trainingData, labelRefs) numDocs = model.getClassifier()._numPatterns print "Model trained with %d documents" % (numDocs,) knn = model.getClassifier() hc = HierarchicalClustering(knn) hc.cluster("complete") protos, clusterSizes = hc.getClusterPrototypes(args.numClusters, numDocs) # Run test to ensure consistency with KNN if args.knnTest: knnTest(protos, knn) return # Summary statistics # bucketCounts[i, j] is the number of occurrances of bucket j in cluster i bucketCounts = numpy.zeros((args.numClusters, len(labelRefs))) for clusterId in xrange(len(clusterSizes)): print print "Cluster %d with %d documents" % (clusterId, clusterSizes[clusterId]) print "===============" prototypeNum = 0 for index in protos[clusterId]: if index != -1: docId = trainingData[index][2] prototypeNum += 1 display = prototypeNum <= args.numPrototypes if display: print "(%d) %s" % (docId, trainingData[index][0]) print "Buckets:" # The docId keys in documentCategoryMap are strings rather than ints if docId in documentCategoryMap: for bucketId in documentCategoryMap[docId]: bucketCounts[clusterId, bucketId] += 1 if display: print " ", labelRefs[bucketId] elif display: print " <None>" if display: print "\n\n" createBucketClusterPlot(args, bucketCounts) create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)
def testMappingsWithImbuWordModel(self):
    """
    Check word-level ID mappings of a Keywords model trained token by token
    on self.testDocuments.
    """
    modelName = "Keywords"
    modelParams = dict(
        numLabels=1,
        k=42,
        classifierMetric="pctOverlapOfInput",
        filterText=True,
        verbosity=0,
    )
    model = createModel(modelName, **modelParams)

    # Train one token at a time. A token's ID is the document index scaled by
    # tokenIndexingFactor plus the token's mapped index, and reset is 1 only
    # for the last token of each document.
    for seqId, text in enumerate(self.testDocuments):
        tokenList, mapping = model.tokenize(text)
        lastTokenIndex = len(tokenList) - 1
        for i, pair in enumerate(zip(tokenList, mapping)):
            token, tokenIndex = pair
            wordId = seqId * self.tokenIndexingFactor + tokenIndex
            resetFlag = int(i == lastTokenIndex)
            model.trainToken(token, [0], wordId, reset=resetFlag)

    # --- Query 1: expect two matches from one sample ---
    query = ("The key to artificial intelligence has always been the "
             "representation.")
    _, sortedIds, sortedDistances = model.inferDocument(
        query, returnDetailedResults=True, sortResults=True)

    # Expected word-token mapping (in prototype IDs)
    self.assertItemsEqual(
        self.filteredProtoIds,
        sortedIds,
        "List of IDs returned from inference does not match the expected list of "
        "prototype IDs.")

    # Exact matching results
    self.assertSequenceEqual(
        [0.0, 0.0, 1.0],
        sortedDistances[:3].tolist(),
        "Expected two exact-matching prototypes.")

    # Multiple matches per sample
    results = self._formatResults(modelName, sortedDistances, sortedIds)
    best, runnerUp = results[0], results[1]
    self.assertEqual(best["sampleId"], runnerUp["sampleId"])
    self.assertEqual(best["text"], runnerUp["text"])
    self.assertNotEqual(best["wordId"], runnerUp["wordId"])

    # The match maps back to the query
    wordsInBest = best["text"].split(" ")
    matchingWord = wordsInBest[best["wordId"]]
    self.assertIn(matchingWord, query, "Matching word is indexed incorrectly.")

    # --- Query 2: expect five matches from two samples ---
    query = "sequence"
    _, sortedIds, sortedDistances = model.inferDocument(
        query, returnDetailedResults=True, sortResults=True)

    # Exact matching results
    self.assertSequenceEqual(
        [0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        sortedDistances[:6].tolist(),
        "Expected five exact-matching prototypes.")

    # The exact matches map back to the query term
    results = self._formatResults(modelName, sortedDistances, sortedIds)
    for match in results[:5]:
        self.assertIn(match["sampleId"], (2, 3))
        matchingWord = match["text"].split(" ")[match["wordId"]]
        self.assertIn(query, matchingWord, "Matching word is indexed incorrectly.")
def runExperiment(args): if not os.path.exists(SAVE_PATH): os.makedirs(SAVE_PATH) (trainingDataDup, labelRefs, documentCategoryMap, documentTextMap) = readDataAndReshuffle(args) # remove duplicates from training data includedDocIds = set() trainingData = [] for record in trainingDataDup: if record[2] not in includedDocIds: includedDocIds.add(record[2]) trainingData.append(record) args.networkConfig = getNetworkConfig(args.networkConfigPath) model = createModel(numLabels=1, **vars(args)) model = trainModel(args, model, trainingData, labelRefs) numDocs = model.getClassifier()._numPatterns print "Model trained with %d documents" % (numDocs, ) knn = model.getClassifier() hc = HierarchicalClustering(knn) hc.cluster("complete") protos, clusterSizes = hc.getClusterPrototypes(args.numClusters, numDocs) # Run test to ensure consistency with KNN if args.knnTest: knnTest(protos, knn) return # Summary statistics # bucketCounts[i, j] is the number of occurrances of bucket j in cluster i bucketCounts = numpy.zeros((args.numClusters, len(labelRefs))) for clusterId in xrange(len(clusterSizes)): print print "Cluster %d with %d documents" % (clusterId, clusterSizes[clusterId]) print "===============" prototypeNum = 0 for index in protos[clusterId]: if index != -1: docId = trainingData[index][2] prototypeNum += 1 display = prototypeNum <= args.numPrototypes if display: print "(%d) %s" % (docId, trainingData[index][0]) print "Buckets:" # The docId keys in documentCategoryMap are strings rather than ints if docId in documentCategoryMap: for bucketId in documentCategoryMap[docId]: bucketCounts[clusterId, bucketId] += 1 if display: print " ", labelRefs[bucketId] elif display: print " <None>" if display: print "\n\n" createBucketClusterPlot(args, bucketCounts) create2DSVDProjection(args, protos, trainingData, documentCategoryMap, knn)