def testSaveData(self):
    """Split the sample data, save it, and verify the CSV and JSON output."""
    generator = NetworkDataGenerator()
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    generator.split(samplePath, 3, False)

    dataPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(self.dirName,
                                  "test_data/multi_sample_categories.json")
    self.assertTrue(generator.saveData(dataPath, categoriesPath))

    # The first two CSV rows are the field-type and special-field headers.
    headerTypes = {"_category": "list", "_token": "string",
                   "_sequenceId": "int", "_reset": "int", "ID": "string"}
    headerSpecials = {"_category": "C", "_token": "", "_sequenceId": "S",
                      "_reset": "R", "ID": ""}
    expectedRows = [headerTypes, headerSpecials]
    for data in self.expected:
        expectedRows.extend(data)

    frame = pandas.read_csv(dataPath).fillna("")
    for rowIndex, rowValues in frame.iterrows():
        row = rowValues.to_dict()
        if rowIndex > 1:
            # csv values are strings, so cast the ints
            row["_sequenceId"] = int(row["_sequenceId"])
            row["_reset"] = int(row["_reset"])
        self.assertDictEqual(row, expectedRows[rowIndex])

    with open(categoriesPath) as f:
        savedCategories = json.load(f)
    self.assertDictEqual(savedCategories,
                         {"kitchen": 0, "environment": 1, "not helpful": 2})
def testSplitPreprocess(self):
    """Split with preprocessing enabled and verify the tokenized records."""
    generator = NetworkDataGenerator()
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")

    firstSequence = [
        {"_token": "gohbkchoo", "_category": "0 1", "_sequenceId": 0,
         "ID": "1", "_reset": 1},
    ]
    secondSequence = [
        {"_token": "o", "_category": "2", "_sequenceId": 1,
         "ID": "2", "_reset": 1},
        {"_token": "ca", "_category": "2", "_sequenceId": 1,
         "ID": "2", "_reset": 0},
    ]

    generator.split(samplePath, 3, True, ignoreCommon=100, correctSpell=True)
    self.assertRecordsEqual(generator.records,
                            [firstSequence, secondSequence])
def testSaveDataIncorrectType(self):
    """saveData must raise TypeError when the categories file is not .json."""
    generator = NetworkDataGenerator()
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    dataPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(self.dirName,
                                  "test_data/multi_sample_categories.csv")
    generator.split(samplePath, 3, False)

    with self.assertRaises(TypeError):
        generator.saveData(dataPath, categoriesPath)
def testFileRecordStreamReadData(self):
    """Saved network data must be readable by FileRecordStream."""
    generator = NetworkDataGenerator()
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    generator.split(samplePath, 3, False)

    dataPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(self.dirName,
                                  "test_data/multi_sample_categories.json")
    generator.saveData(dataPath, categoriesPath)

    # If no error is raised, then the data is in the correct format
    FileRecordStream(dataPath)
def generateNetworkDataFiles(self, splits, seed, preprocess, **kwargs):
    """
    Split the input data and write one network-format data file per split.

    @param splits     (int)  Number of experiment iterations; one data file is
                             generated for each.
    @param seed       (int)  Seed for randomizing the data order; incremented
                             after each non-k-folds split so each file gets a
                             different ordering.
    @param preprocess (bool) Whether to text-preprocess the data when splitting.
    Remaining kwargs are forwarded to NetworkDataGenerator.split().
    """
    # TODO: use model.prepData()?
    ndg = NetworkDataGenerator()
    self.dataDict = ndg.split(
        filePath=self.dataPath, numLabels=self.numClasses,
        textPreprocess=preprocess, **kwargs)

    filename, ext = os.path.splitext(self.dataPath)
    self.classificationFile = "{}_categories.json".format(filename)

    # Generate one data file for each experiment iteration.
    if self.experimentType == "k-folds" and not self.orderedSplit:
        # only randomize the data order once for k-folds cross validation
        ndg.randomizeData(seed)
    for i in xrange(splits):
        if self.experimentType != "k-folds" and not self.orderedSplit:
            # Fresh shuffle (and fresh seed) for every non-k-folds split.
            ndg.randomizeData(seed)
            seed += 1
        # ext='.csv'
        dataFile = "{}_network_{}{}".format(filename, i, ext)
        ndg.saveData(dataFile, self.classificationFile)
        self.dataFiles.append(dataFile)

    if self.verbosity > 0:
        print "{} file(s) generated at {}".format(len(self.dataFiles),
                                                  self.dataFiles)
        print "Classification JSON is at: {}".format(
            self.classificationFile)
def _deSerializeExtraData(self, extraDataDir):
    """
    Protected method that is called during deserialization (after
    __setstate__) with an external directory path. Restores the Network API
    instance (which has its own serialization), the region helper members,
    and the data generator stripped out of the pickled state.

    @param extraDataDir (string) Model's extra data directory path
    """
    networkPath = os.path.join(extraDataDir, "network.nta")
    self.network = Network(networkPath)
    self._initializeRegionHelpers()
    self.networkDataGen = NetworkDataGenerator()
def testSaveData(self):
    """Round-trip: split, save, then check the written CSV rows and JSON."""
    ndg = NetworkDataGenerator()
    ndg.split(os.path.join(self.dirName, "test_data/multi_sample.csv"),
              3, False)

    dataOut = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
    categoriesOut = os.path.join(self.dirName,
                                 "test_data/multi_sample_categories.json")
    saved = ndg.saveData(dataOut, categoriesOut)
    self.assertTrue(saved)

    # Expected CSV contents: type header row, special-field header row, then
    # every record from every expected sequence, in order.
    expectedRows = [
        {"_category": "list", "_token": "string", "_sequenceId": "int",
         "_reset": "int", "ID": "string"},
        {"_category": "C", "_token": "", "_sequenceId": "S", "_reset": "R",
         "ID": ""},
    ]
    for data in self.expected:
        expectedRows.extend(data)

    table = pandas.read_csv(dataOut).fillna("")
    for idx, values in table.iterrows():
        record = values.to_dict()
        if idx > 1:
            # csv values are strings, so cast the ints
            record["_sequenceId"] = int(record["_sequenceId"])
            record["_reset"] = int(record["_reset"])
        self.assertDictEqual(record, expectedRows[idx])

    with open(categoriesOut) as f:
        self.assertDictEqual(
            json.load(f),
            {"kitchen": 0, "environment": 1, "not helpful": 2})
def generateNetworkDataFiles(self, splits, seed, preprocess, **kwargs):
    """
    Split the input data and write one network-format data file per split.

    @param splits     (int)  Number of experiment iterations; one data file is
                             generated for each.
    @param seed       (int)  Seed for randomizing the data order; incremented
                             after each non-k-folds split so each file gets a
                             different ordering.
    @param preprocess (bool) Whether to text-preprocess the data when splitting.
    Remaining kwargs are forwarded to NetworkDataGenerator.split().
    """
    # TODO: use model.prepData()?
    ndg = NetworkDataGenerator()
    self.dataDict = ndg.split(
        filePath=self.dataPath, numLabels=self.numClasses,
        textPreprocess=preprocess, **kwargs)

    filename, ext = os.path.splitext(self.dataPath)
    self.classificationFile = "{}_categories.json".format(filename)

    # Generate one data file for each experiment iteration.
    if self.experimentType == "k-folds" and not self.orderedSplit:
        # only randomize the data order once for k-folds cross validation
        ndg.randomizeData(seed)
    for i in xrange(splits):
        if self.experimentType != "k-folds" and not self.orderedSplit:
            # Fresh shuffle (and fresh seed) for every non-k-folds split.
            ndg.randomizeData(seed)
            seed += 1
        # ext='.csv'
        dataFile = "{}_network_{}{}".format(filename, i, ext)
        ndg.saveData(dataFile, self.classificationFile)
        self.dataFiles.append(dataFile)

    if self.verbosity > 0:
        print "{} file(s) generated at {}".format(len(self.dataFiles),
                                                  self.dataFiles)
        print "Classification JSON is at: {}".format(self.classificationFile)
def testSplitPreprocess(self):
    """Preprocessed split should produce cleaned, spell-corrected tokens."""
    ndg = NetworkDataGenerator()
    dataPath = os.path.join(self.dirName, "test_data/multi_sample.csv")

    expectedSequences = [
        [
            {"_token": "gohbkchoo", "_category": "0 1", "_sequenceId": 0,
             "ID": "1", "_reset": 1},
        ],
        [
            {"_token": "o", "_category": "2", "_sequenceId": 1,
             "ID": "2", "_reset": 1},
            {"_token": "ca", "_category": "2", "_sequenceId": 1,
             "ID": "2", "_reset": 0},
        ],
    ]

    ndg.split(dataPath, 3, True, ignoreCommon=100, correctSpell=True)
    self.assertRecordsEqual(ndg.records, expectedSequences)
def train(self): """ Train the network regions on the entire dataset. There should be one datafile for each training rep in self.dataFiles, where every data sample (i.e. sequence) appears only once in each file. """ # TODO: ignore patterns < minSparsity (= 0.9 * unionSparsity) if self.trainingReps != len(self.dataFiles): raise RuntimeError( "Mismatch between the number of specified training " "reps and the number of data files (should be 1:1).") for dataFile in self.dataFiles: if self.verbosity > 0: print "Running all the data through the network for training..." self.model.swapRecordStream(dataFile) numTokens = NetworkDataGenerator().getNumberOfTokens(dataFile) n = sum(numTokens) self.model.trainNetwork(n) # Populate the classifier space by running through the current data file; # learning (in other regions) is turned off by the model. if self.verbosity > 1: print "Populating the classifier with all of the sequences." self.classifiedSeqIds = self.model.classifyNetwork(n)
def __init__(self, networkConfig, inputFilePath, retinaScaling=1.0,
             retina="en_associative", apiKey=None, verbosity=1, numLabels=3,
             modelDir="ClassificationModelHTM", prepData=True,
             stripCats=False):
    """
    @param networkConfig (dict) Network configuration dict with region
        parameters.
    @param inputFilePath (str) Path to data file.
    @param retinaScaling (float) Scales the dimensions of the SDRs.
    @param retina (str) Name of Cio retina.
    @param apiKey (str) Key for Cio API.
    @param prepData (bool) Prepare the input data into network API format.
    @param stripCats (bool) Remove the categories and replace them with the
        sequence_Id.
    See ClassificationModel for remaining parameters.

    Note classifierMetric is not specified here as it is in other models. This
    is done in the network config file.
    """
    super(ClassificationModelHTM, self).__init__(verbosity=verbosity,
                                                 numLabels=numLabels,
                                                 modelDir=modelDir)
    self.networkConfig = networkConfig
    self.retinaScaling = retinaScaling
    self.retina = retina
    self.apiKey = apiKey
    self.inputFilePath = inputFilePath

    self.networkDataGen = NetworkDataGenerator()
    if prepData:
        # Convert the raw input file into network API format up front.
        self.networkDataPath = self.prepData(self.inputFilePath,
                                             stripCats=stripCats)
    else:
        self.networkDataPath = self.inputFilePath

    self.network = self.initModel()
    self._initializeRegionHelpers()
def _getClassifications(self, iteration):
    """
    Get the classifications for a particular iteration.

    @param iteration (int) Iteration of the experiment.
    @return (list) List of list of ids of classifications for a sample.
    """
    labelStrings = NetworkDataGenerator.getClassifications(
        self.dataFiles[iteration])
    # Each entry is a space-separated string of label ids; parse to ints.
    return [[int(label) for label in entry.strip().split(" ")]
            for entry in labelStrings]
def partitionIndices(self, seed=42, numInference=10):
    """
    Sets self.partitions for the buckets' querying and ranking sets. The
    corresponding numbers of tokens for each sequence are stored in
    self.numTokens.

    The order of sequences is already specified by the network data files; if
    generated by the experiment, these are in order or randomized as specified
    by the orderedSplit arg.
    """
    super(BucketHTMRunner, self).partitionIndices(
        seed=seed, numInference=numInference)

    # Get the number of tokens in each bucket file so the network knows how many
    # iterations to run. The order of buckets in self.bucketFiles is not
    # necessarily the same
    tokenCounter = NetworkDataGenerator()
    for bucketFile in self.bucketFiles:
        self.numTokens.append(tokenCounter.getNumberOfTokens(bucketFile))
def partitionIndices(self, _):
    """
    Sets self.partitions for the number of tokens for each sample in the
    training and test sets. The order of sequences is already specified by the
    network data files; if generated by the experiment, these are in order or
    randomized as specified by the orderedSplit arg.
    """
    if self.experimentType == "k-folds":
        for fold in xrange(self.folds):
            dataFile = self.dataFiles[fold]
            numTokens = NetworkDataGenerator.getNumberOfTokens(dataFile)
            # NOTE(review): self.partitions is reassigned on every fold, so
            # only the last fold's split survives the loop — confirm whether
            # KFolds.split() already returns all folds or this should append.
            self.partitions = KFolds(self.folds).split(numTokens,
                                                       randomize=False)
    else:
        for trial, split in enumerate(self.trainSizes):
            dataFile = self.dataFiles[trial]
            numTokens = NetworkDataGenerator.getNumberOfTokens(dataFile)
            # First `split` sequences train; the remainder test.
            self.partitions.append((numTokens[:split], numTokens[split:]))
def __init__(
    self,
    networkConfig,
    inputFilePath,
    retinaScaling=1.0,
    retina="en_associative",
    apiKey=None,
    verbosity=1,
    numLabels=3,
    modelDir="ClassificationModelHTM",
    prepData=True,
    stripCats=False,
    cacheRoot=None,
):
    """
    @param networkConfig (dict) Network configuration dict with region
        parameters.
    @param inputFilePath (str) Path to data file.
    @param retinaScaling (float) Scales the dimensions of the SDRs.
    @param retina (str) Name of Cio retina.
    @param apiKey (str) Key for Cio API.
    @param prepData (bool) Prepare the input data into network API format.
    @param stripCats (bool) Remove the categories and replace them with the
        sequence_Id.
    @param cacheRoot (str) Root cache directory for CioEncoder

    See ClassificationModel for remaining parameters.

    Note classifierMetric is not specified here as it is in other models. This
    is done in the network config file.
    """
    super(ClassificationModelHTM, self).__init__(verbosity=verbosity,
                                                 numLabels=numLabels,
                                                 modelDir=modelDir)
    self.networkConfig = networkConfig
    self.retinaScaling = retinaScaling
    self.retina = retina
    self.apiKey = apiKey
    self.inputFilePath = inputFilePath

    self.networkDataGen = NetworkDataGenerator()
    if prepData:
        # Convert the raw input file into network API format up front.
        self.networkDataPath = self.prepData(self.inputFilePath,
                                             stripCats=stripCats)
    else:
        self.networkDataPath = self.inputFilePath

    # Default the encoder cache to this module's directory when unspecified.
    self.cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

    self.network = self.initModel()
    self._initializeRegionHelpers()
def testSaveDataIncorrectType(self):
    """A non-JSON categories path must make saveData raise TypeError."""
    ndg = NetworkDataGenerator()
    inputPath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    splitPath = os.path.join(self.dirName,
                             "test_data/multi_sample_split.csv")
    badCategoriesPath = os.path.join(
        self.dirName, "test_data/multi_sample_categories.csv")

    ndg.split(inputPath, 3, False)
    with self.assertRaises(TypeError):
        ndg.saveData(splitPath, badCategoriesPath)
def __init__(self, networkConfig, inputFilePath, retinaScaling=1.0,
             retina="en_associative", apiKey=None, verbosity=1, numLabels=3,
             modelDir="ClassificationModelHTM", prepData=True,
             stripCats=False):
    """
    @param networkConfig (dict) Network configuration dict with region
        parameters.
    @param inputFilePath (str) Path to data file.
    @param retinaScaling (float) Scales the dimensions of the SDRs.
    @param retina (str) Name of Cio retina.
    @param apiKey (str) Key for Cio API.
    @param prepData (bool) Prepare the input data into network API format.
    @param stripCats (bool) Remove the categories and replace them with the
        sequence_Id.
    See ClassificationModel for remaining parameters.
    """
    super(ClassificationModelHTM, self).__init__(
        verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)
    self.networkConfig = networkConfig
    self.retinaScaling = retinaScaling
    self.retina = retina
    self.apiKey = apiKey

    self.networkDataGen = NetworkDataGenerator()
    if prepData:
        # Convert the raw input file into network API format up front.
        self.networkDataPath = self.prepData(inputFilePath,
                                             stripCats=stripCats)
    else:
        self.networkDataPath = inputFilePath

    self.network = self.initModel()
    self.learningRegions = self._getLearningRegions()

    # Always a sensor and classifier region.
    self.sensorRegion = self.network.regions[
        self.networkConfig["sensorRegionConfig"].get("regionName")]
    self.classifierRegion = self.network.regions[
        self.networkConfig["classifierRegionConfig"].get("regionName")]
def testFileRecordStreamReadData(self):
    """FileRecordStream should accept the saved network-format CSV."""
    ndg = NetworkDataGenerator()
    ndg.split(os.path.join(self.dirName, "test_data/multi_sample.csv"),
              3, False)

    splitPath = os.path.join(self.dirName,
                             "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    ndg.saveData(splitPath, categoriesPath)

    # If no error is raised, then the data is in the correct format
    FileRecordStream(splitPath)
def testing(self, trial, seed):
    """
    Test the network on the test set for a particular trial and store the
    results

    @param trial (int) trial count
    @param seed  (int) Seed passed through to self.model.testModel().
    """
    if self.verbosity > 0:
        # Report the flat token indices at which each test sequence starts.
        i = sum(self.partitions[trial][0])
        indices = []
        for numTokens in self.partitions[trial][1]:
            indices.append(i)
            i += numTokens
        print ("\tRunner selects to test on sequences starting at indices "
               "{}".format(indices))

    results = ([], [])
    testIndex = len(self.partitions[trial][0])
    for numTokens in self.partitions[trial][1]:
        # One model step per token of the sequence; collect per-token output.
        predictions = []
        activations = []
        for _ in xrange(numTokens):
            predicted, active = self.model.testModel(seed)
            activations.append(active)
            predictions.append(predicted)
        winningPredictions = self._selectWinners(predictions, activations)

        # TODO: switch to standard (expected, actual) format
        results[0].append(winningPredictions)
        results[1].append(self.actualLabels[trial][testIndex])
        testIndex += 1

    # Prepare data for writeOutClassifications: replace the token-count
    # partitions with plain sequence-index ranges.
    trainIdx = range(len(self.partitions[trial][0]))
    testIdx = range(
        len(self.partitions[trial][0]),
        len(self.partitions[trial][0]) + len(self.partitions[trial][1]))
    self.partitions[trial] = (trainIdx, testIdx)
    self.samples = NetworkDataGenerator.getSamples(self.dataFiles[trial])

    self.results.append(results)
def testing(self, trial, seed):
    """
    Test the network on the test set for a particular trial and store the
    results

    @param trial (int) trial count
    @param seed  (int) Seed passed through to self.model.testModel().
    """
    if self.verbosity > 0:
        # Report the flat token indices at which each test sequence starts.
        i = sum(self.partitions[trial][0])
        indices = []
        for numTokens in self.partitions[trial][1]:
            indices.append(i)
            i += numTokens
        print ("\tRunner selects to test on sequences starting at indices "
               "{}".format(indices))

    results = ([], [])
    testIndex = len(self.partitions[trial][0])
    for numTokens in self.partitions[trial][1]:
        # One model step per token of the sequence; collect per-token output.
        predictions = []
        activations = []
        for _ in xrange(numTokens):
            predicted, active = self.model.testModel(seed)
            activations.append(active)
            predictions.append(predicted)
        winningPredictions = self._selectWinners(predictions, activations)

        # TODO: switch to standard (expected, actual) format
        results[0].append(winningPredictions)
        results[1].append(self.actualLabels[trial][testIndex])
        testIndex += 1

    # Prepare data for writeOutClassifications: replace the token-count
    # partitions with plain sequence-index ranges.
    trainIdx = range(len(self.partitions[trial][0]))
    testIdx = range(len(self.partitions[trial][0]),
                    len(self.partitions[trial][0]) +
                    len(self.partitions[trial][1]))
    self.partitions[trial] = (trainIdx, testIdx)
    self.samples = NetworkDataGenerator.getSamples(self.dataFiles[trial])

    self.results.append(results)
def testRandomize(self):
    """randomizeData should shuffle the order of the saved sequences."""
    ndg = NetworkDataGenerator()
    ndg.split(os.path.join(self.dirName, "test_data/multi_sample.csv"),
              3, False)

    random.seed(1)
    ndg.randomizeData()

    dataPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    ndg.saveData(dataPath, categoriesPath)

    # Collect the distinct sequence IDs in the order they were written.
    orderedIDs = []
    for _, rowValues in pandas.read_csv(dataPath).iterrows():
        sequenceId = rowValues.to_dict()["_sequenceId"]
        if sequenceId.isdigit() and (not orderedIDs or
                                     orderedIDs[-1] != sequenceId):
            orderedIDs.append(sequenceId)

    # A shuffled order should not equal the ascending 0..n-1 order.
    self.assertNotEqual(orderedIDs, range(len(orderedIDs)))
def testRandomize(self):
    """After randomizeData, saved sequence IDs must not be in sorted order."""
    generator = NetworkDataGenerator()
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    generator.split(samplePath, 3, False)

    random.seed(1)
    generator.randomizeData()

    splitPath = os.path.join(self.dirName,
                             "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(self.dirName,
                                  "test_data/multi_sample_categories.json")
    generator.saveData(splitPath, categoriesPath)

    # Walk the written CSV and record each new sequence ID as it appears.
    seenIDs = []
    table = pandas.read_csv(splitPath)
    for _, row in table.iterrows():
        seqId = row.to_dict()["_sequenceId"]
        if seqId.isdigit() and (not seenIDs or seenIDs[-1] != seqId):
            seenIDs.append(seqId)

    self.assertNotEqual(seenIDs, range(len(seenIDs)))
def testSplitNoPreprocess(self):
    """Splitting without preprocessing should yield the raw expected records."""
    generator = NetworkDataGenerator()
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    generator.split(samplePath, 3, False)
    self.assertRecordsEqual(generator.records, self.expected)
class ClassificationModelHTM(ClassificationModel):
    """Classify text using generic network-API based models."""

    def __init__(
        self,
        networkConfig,
        inputFilePath,
        retinaScaling=1.0,
        retina="en_associative",
        apiKey=None,
        verbosity=1,
        numLabels=3,
        modelDir="ClassificationModelHTM",
        prepData=True,
        stripCats=False,
        cacheRoot=None,
    ):
        """
        @param networkConfig (dict) Network configuration dict with region
            parameters.
        @param inputFilePath (str) Path to data file.
        @param retinaScaling (float) Scales the dimensions of the SDRs.
        @param retina (str) Name of Cio retina.
        @param apiKey (str) Key for Cio API.
        @param prepData (bool) Prepare the input data into network API format.
        @param stripCats (bool) Remove the categories and replace them with
            the sequence_Id.
        @param cacheRoot (str) Root cache directory for CioEncoder

        See ClassificationModel for remaining parameters.

        Note classifierMetric is not specified here as it is in other models.
        This is done in the network config file.
        """
        super(ClassificationModelHTM, self).__init__(verbosity=verbosity,
                                                     numLabels=numLabels,
                                                     modelDir=modelDir)
        self.networkConfig = networkConfig
        self.retinaScaling = retinaScaling
        self.retina = retina
        self.apiKey = apiKey
        self.inputFilePath = inputFilePath

        self.networkDataGen = NetworkDataGenerator()
        if prepData:
            # Convert the raw input file into network API format up front.
            self.networkDataPath = self.prepData(self.inputFilePath,
                                                 stripCats=stripCats)
        else:
            self.networkDataPath = self.inputFilePath

        # Default the encoder cache to this module's directory when
        # unspecified.
        self.cacheRoot = cacheRoot or os.path.dirname(
            os.path.realpath(__file__))

        self.network = self.initModel()
        self._initializeRegionHelpers()

    def getClassifier(self):
        """
        Returns the classifier for the model.
        """
        return self.classifierRegion.getSelf().getAlgorithmInstance()

    def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
        """
        Generate the data in network API format.

        @param dataPath (str) Path to input data file; format as expected by
            NetworkDataGenerator.
        @param ordered (bool) Keep order of data, or randomize.
        @param stripCats (bool) Remove the categories and replace them with
            the sequence_Id.
        @return networkDataPath (str) Path to data formatted for network API.
        """
        networkDataPath = self.networkDataGen.setupData(dataPath,
                                                        self.numLabels,
                                                        ordered,
                                                        stripCats,
                                                        **kwargs)
        return networkDataPath

    def initModel(self):
        """
        Initialize the network; self.networkDataPath must already be set.
        """
        if self.networkDataPath is not None:
            recordStream = FileRecordStream(streamID=self.networkDataPath)
        else:
            recordStream = None

        encoder = CioEncoder(
            retinaScaling=self.retinaScaling,
            cacheDir=os.path.join(self.cacheRoot, "CioCache"),
            retina=self.retina,
            apiKey=self.apiKey,
            verbosity=self.verbosity - 1,
        )

        # This encoder specifies the LanguageSensor output width.
        return configureNetwork(recordStream, self.networkConfig, encoder)

    def _initializeRegionHelpers(self):
        """
        Set helper member variables once network has been initialized. This
        will also be called from _deSerializeExtraData()
        """
        learningRegions = []
        for region in self.network.regions.values():
            spec = region.getSpec()
            if spec.parameters.contains("learningMode"):
                learningRegions.append(region)

        # Always a sensor and classifier region.
        self.sensorRegion = self.network.regions[
            self.networkConfig["sensorRegionConfig"].get("regionName")]
        self.classifierRegion = self.network.regions[
            self.networkConfig["classifierRegionConfig"].get("regionName")]

        # There is sometimes a TP region
        self.tpRegion = None
        if self.networkConfig.has_key("tpRegionConfig"):
            self.tpRegion = self.network.regions[
                self.networkConfig["tpRegionConfig"].get("regionName")]

        self.learningRegions = learningRegions

        self.network.enableProfiling()

    # TODO: is this still needed?
    def encodeSample(self, sample):
        """
        Put each token in its own dictionary with its bitmap

        @param sample (list) Tokenized sample, where each item is a string
            token.
        @return (list) The sample text, sparsity, and bitmap for each token.
            Since the network will do the actual encoding, the bitmap and
            sparsity will be None.
            Example return list:
            [{
                "text": "Example text",
                "sparsity": 0.0,
                "bitmap": None
            }]
        """
        return [{"text": t, "sparsity": None, "bitmap": None}
                for t in sample]

    def resetModel(self):
        """
        Reset the model by creating a new network since the network API does
        not support resets.
        """
        # TODO: test this works as expected
        self.network = self.initModel()

    def saveModel(self, trial=None):
        """Save the network to <modelDir>/network[_<trial>].nta."""
        try:
            if not os.path.exists(self.modelDir):
                os.makedirs(self.modelDir)
            if trial:
                netPath = os.path.join(self.modelDir,
                                       "network_{}.nta".format(trial))
            else:
                netPath = os.path.join(self.modelDir, "network.nta")
            self.network.save(netPath)
            if self.verbosity > 0:
                print "Model saved to '{}'.".format(netPath)
        except IOError as e:
            print "Could not save model to '{}'.".format(netPath)
            raise e

    def trainModel(self, iterations=1):
        """
        Run the network with all regions learning.
        Note self.sampleReference doesn't get populated b/c in a network
        model there's a 1-to-1 mapping of training samples.
        """
        for region in self.learningRegions:
            region.setParameter("learningMode", True)
        self.network.run(iterations)

    def trainNetwork(self, iterations):
        """Run the network with all regions learning but the classifier."""
        for region in self.learningRegions:
            if region.name == "classifier":
                region.setParameter("learningMode", False)
            else:
                region.setParameter("learningMode", True)
        self.network.run(iterations)

    def classifyNetwork(self, iterations):
        """
        For running after the network has been trained by trainNetwork(),
        this populates the KNN prototype space with the final network
        representations.

        @return (list) Sequence ID emitted by the sensor at each iteration.
        """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
        sensor = self.sensorRegion.getSelf()
        # Replay the record stream from the beginning.
        sensor.rewind()

        # Only the classifier learns while the space is populated.
        self.classifierRegion.setParameter("learningMode", True)
        self.classifierRegion.setParameter("inferenceMode", True)

        sequenceIds = []
        for _ in xrange(iterations):
            self.network.run(1)
            sequenceIds.append(sensor.getOutputValues("sequenceIdOut")[0])

        return sequenceIds

    def inferNetwork(self, iterations, fileRecord=None, learn=False):
        """
        Run the network to infer distances to the classified samples.

        @param fileRecord (str) If you want to change the file record stream.
        @param learn (bool) The classifier will learn the inferred sequence.
        """
        if fileRecord:
            self.swapRecordStream(fileRecord)
        self.classifierRegion.setParameter("learningMode", learn)
        self.classifierRegion.setParameter("inferenceMode", True)

        sampleDistances = None
        for i in xrange(iterations):
            self.network.run(1)
            inferenceValues = self.classifierRegion.getOutputData(
                "categoriesOut")
            # Sum together the inferred distances for each word of the
            # sequence.
            if sampleDistances is None:
                sampleDistances = inferenceValues
            else:
                sampleDistances += inferenceValues

        return sampleDistances

    def swapRecordStream(self, dataPath):
        """Change the data source for the network's sensor region."""
        recordStream = FileRecordStream(streamID=dataPath)
        sensor = self.sensorRegion.getSelf()
        sensor.dataSource = recordStream  # TODO: implement this in network API

    def testModel(self, seed=42):
        """
        Test the classifier region on the input sample. Call this method for
        each word of a sequence. The random seed is used in
        getWinningLabels().

        @return (numpy array) numLabels most-frequent classifications for the
            data samples; int or empty.
        """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
        self.classifierRegion.setParameter("inferenceMode", True)

        self.network.run(1)

        inference = self._getClassifierInference(seed)
        activityBitmap = self.classifierRegion.getInputData("bottomUpIn")

        return inference, activityBitmap

    def _getClassifierInference(self, seed):
        """Return output categories from the classifier region."""
        relevantCats = self.classifierRegion.getParameter("categoryCount")

        if self.classifierRegion.type == "py.KNNClassifierRegion":
            # max number of inferences = k
            inferenceValues = self.classifierRegion.getOutputData(
                "categoriesOut")[:relevantCats]
            return self.getWinningLabels(inferenceValues, seed)

        elif self.classifierRegion.type == "py.CLAClassifierRegion":
            # TODO: test this
            return self.classifierRegion.getOutputData(
                "categoriesOut")[:relevantCats]

    def queryModel(self, query, preprocess=False):
        """
        Run the query through the network, getting the classifier region's
        inferences for all words of the query sequence.

        @return (list) Two-tuples of sequence ID and distance, sorted closest
            to farthest from the query.
        """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
        self.classifierRegion.setParameter("inferenceMode", True)

        # Put query text in LanguageSensor data format.
        queryDicts = self.networkDataGen.generateSequence(query, preprocess)

        sensor = self.sensorRegion.getSelf()
        sampleDistances = None
        for qD in queryDicts:
            # Sum together the inferred distances for each word of the query
            # sequence.
            sensor.queue.appendleft(qD)
            self.network.run(1)
            inferenceValues = self.classifierRegion.getOutputData(
                "categoriesOut")
            if sampleDistances is None:
                sampleDistances = inferenceValues
            else:
                sampleDistances += inferenceValues

        catCount = self.classifierRegion.getParameter("categoryCount")
        # numpy.lexsort() sorts primarily by its LAST key (the distances);
        # the random values only break ties between equal distances.
        randomValues = numpy.random.random(catCount)
        sortedSamples = numpy.lexsort((randomValues,
                                       sampleDistances[:catCount]))
        qTuple = [(a, b) for a, b in zip(sortedSamples,
                                         sampleDistances[:catCount])]

        return sorted(qTuple, key=operator.itemgetter(1))

    def tokenize(self, text, preprocess=False):
        """
        Given a bunch of text (could be several sentences) return a single
        list containing individual tokens. Text is tokenized using the CIO
        tokenize method.

        @param text (str) A bunch of text.
        @param preprocess (bool) Whether or not to preprocess the text data.
            NOTE(review): currently unused in this method — confirm intent.
        """
        encoder = self.sensorRegion.getSelf().encoder
        sentenceList = encoder.client.tokenize(text)
        tokenList = []
        for sentence in sentenceList:
            # Cio returns each sentence as a comma-separated token string.
            tokenList.extend(sentence.split(","))
        return tokenList

    def reset(self):
        """
        Issue a reset signal to the model. The assumption is that a sequence
        has just ended and a new sequence is about to begin. The default
        behavior is to do nothing - not all subclasses may re-implement this.
        """
        # TODO: Introduce a consistent reset method name.
        for r in self.learningRegions:
            if r.type == "py.TemporalPoolerRegion":
                r.executeCommand(["reset"])
            elif r.type == "py.TPRegion":
                r.executeCommand(["resetSequenceStates"])

    def trainText(self, token, labels, sequenceId=None, reset=0):
        """
        Train the model with the given text token, associated labels, and
        sequence ID.

        @param token (str) The text token to train on
        @param labels (list) A list of one or more integer labels associated
            with this token. If the list is empty, the classifier will not be
            trained.
        @param sequenceId (int) An integer ID associated with this token and
            its sequence (document).
        @param reset (int) Should be 0 or 1. If 1, assumes we are at the
            beginning of a new sequence.
        """
        for region in self.learningRegions:
            region.setParameter("learningMode", True)
        sensor = self.sensorRegion.getSelf()
        sensor.addDataToQueue(token, labels, sequenceId, 0)
        self.network.run(1)

        # Print the outputs of each region
        if self.verbosity >= 2:
            self.printRegionOutputs()

        if reset == 1:
            self.reset()

    def classifyText(self, token, reset=0):
        """
        Classify the token and return a list of the best classifications.

        @param token (str) The text token to train on
        @param reset (int) Should be 0 or 1. If 1, assumes we are at the end
            of a sequence. A reset signal will be issued after the model has
            been trained on this token.

        @return (numpy array) An array of size numLabels. Position i contains
            the likelihood that this sample belongs to the i'th category. An
            array containing all zeros implies no decision could be made.
        """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
            region.setParameter("inferenceMode", True)
        sensor = self.sensorRegion.getSelf()
        # -1 sequence ID and [None] labels mark this as an inference-only row.
        sensor.addDataToQueue(token, [None], -1, 0)
        self.network.run(1)

        # Print the outputs of each region
        if self.verbosity >= 2:
            self.printRegionOutputs()

        if reset == 1:
            self.reset()

        return self.classifierRegion.getOutputData(
            "categoriesOut")[0:self.numLabels]

    def printRegionOutputs(self):
        """
        Print the outputs of regions to console for debugging, depending on
        verbosity level.
        """
        print "================== HTM Debugging output:"
        print "Sensor output:",
        print self.sensorRegion.getOutputData("dataOut").nonzero()
        print "Sensor categoryOut:",
        print self.sensorRegion.getOutputData("categoryOut")

        if self.verbosity >= 3:
            if self.tpRegion is not None:
                print "TP region input:",
                print self.tpRegion.getInputData("activeCells").nonzero()
                print "TP region output:",
                print self.tpRegion.getOutputData("mostActiveCells").nonzero()

            print "Classifier bottomUpIn: ",
            print self.classifierRegion.getInputData("bottomUpIn").nonzero()
            print "Classifier categoryIn: ",
            print self.classifierRegion.getInputData(
                "categoryIn")[0:self.numLabels]

            print "Classifier categoriesOut: ",
            print self.classifierRegion.getOutputData(
                "categoriesOut")[0:self.numLabels]
            print "Classifier categoryProbabilitiesOut",
            print self.classifierRegion.getOutputData(
                "categoryProbabilitiesOut")[0:self.numLabels]

    def dumpProfile(self):
        """
        Print region profiling information in a nice format.
        """
        print "Profiling information for {}".format(type(self).__name__)
        # Small epsilon avoids division by zero when nothing has run.
        totalTime = 0.000001
        for region in self.network.regions.values():
            timer = region.computeTimer
            totalTime += timer.getElapsed()

        profileInfo = []
        for region in self.network.regions.values():
            timer = region.computeTimer
            profileInfo.append([region.name,
                                timer.getStartCount(),
                                timer.getElapsed(),
                                100.0 * timer.getElapsed() / totalTime])

        profileInfo.append(["Total time", "", totalTime, "100.0"])
        print tabulate(profileInfo,
                       headers=["Region", "Count", "Elapsed",
                                "Percent of total"],
                       tablefmt="grid")

    def __getstate__(self):
        """
        Return serializable state. This function will return a version of the
        __dict__ with data that shouldn't be pickled stripped out. For
        example, Network API instances are stripped out because they have
        their own serialization mechanism.

        See also: _serializeExtraData()
        """
        state = self.__dict__.copy()
        # Remove member variables that we can't pickle
        state.pop("network")
        state.pop("sensorRegion")
        state.pop("classifierRegion")
        state.pop("tpRegion")
        state.pop("learningRegions")
        state.pop("networkDataGen")

        return state

    def _serializeExtraData(self, extraDataDir):
        """
        Protected method that is called during serialization with an external
        directory path. We override it here to save the Network API instance.

        @param extraDataDir (string) Model's extra data directory path
        """
        self.network.save(os.path.join(extraDataDir, "network.nta"))

    def _deSerializeExtraData(self, extraDataDir):
        """
        Protected method that is called during deserialization (after
        __setstate__) with an external directory path. We override it here to
        load the Network API instance.

        @param extraDataDir (string) Model's extra data directory path
        """
        self.network = Network(os.path.join(extraDataDir, "network.nta"))
        self._initializeRegionHelpers()
        self.networkDataGen = NetworkDataGenerator()
# Set up word lists and accumulators for tracking network activations and
# accuracy over the sample sequences (plotted via matplotlib below).
animals, vegetables = getAnimalVegetableList()
vegetable = {}
animal = {}
tmCellUnion = []
tmInputUnion = []
tpOutput = []
categoryLabel = []
accuracy = []
accuracyTp = []
knnInLastNSequences = 20
knnNumber = 1

plt.close('all')
plt.figure(1)
plt.show()

# One entry per sequence: how many tokens it contains.
numTokens = NetworkDataGenerator.getNumberOfTokens(args.dataPath)
for numSample in xrange(len(numTokens)):
    # union SDR for this sequence
    tmCellActivation = np.zeros((tmRegion._tfdr.cellsPerColumn *
                                 tmRegion._tfdr.columnDimensions[0],))
    tmInputActivation = np.zeros((tmRegion._tfdr.columnDimensions[0],))
    print
    for word in xrange(numTokens[numSample]):
        # Feed one token through the sensor region; compute() writes its
        # results into the pre-allocated sensorOutput dict.
        sensorInput = None
        sensorOutput = {'categoryOut': np.array([0]),
                        'resetOut': [None],
                        'sourceOut': None,
                        'sequenceIdOut': [None],
                        'encodingOut': None,
                        'dataOut': np.zeros((sensorRegion.encoder.n, ))}
        sensorRegion.compute(sensorInput, sensorOutput)
class ClassificationModelHTM(ClassificationModel):
  """Class to run the classification experiments with HTM network models."""

  def __init__(self,
               networkConfig,
               inputFilePath,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelHTM",
               prepData=True,
               stripCats=False):
    """
    @param networkConfig      (dict)    Network configuration dict with region
                                        parameters.
    @param inputFilePath      (str)     Path to data file.
    @param retinaScaling      (float)   Scales the dimensions of the SDRs.
    @param retina             (str)     Name of Cio retina.
    @param apiKey             (str)     Key for Cio API.
    @param prepData           (bool)    Prepare the input data into network API
                                        format.
    @param stripCats          (bool)    Remove the categories and replace them
                                        with the sequence_Id.
    See ClassificationModel for remaining parameters.
    """
    super(ClassificationModelHTM, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.networkConfig = networkConfig
    self.retinaScaling = retinaScaling
    self.retina = retina
    self.apiKey = apiKey

    self.networkDataGen = NetworkDataGenerator()
    if prepData:
      # Convert the raw input file into the network API (FileRecordStream)
      # format before building the network.
      self.networkDataPath = self.prepData(inputFilePath, stripCats=stripCats)
    else:
      self.networkDataPath = inputFilePath

    self.network = self.initModel()
    self.learningRegions = self._getLearningRegions()

    # Always a sensor and classifier region.
    self.sensorRegion = self.network.regions[
      self.networkConfig["sensorRegionConfig"].get("regionName")]
    self.classifierRegion = self.network.regions[
      self.networkConfig["classifierRegionConfig"].get("regionName")]


  def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
    """
    Generate the data in network API format.

    @param dataPath          (str)  Path to input data file; format as
                                    expected by NetworkDataGenerator.
    @param ordered           (bool) Keep order of data, or randomize.
    @param stripCats         (bool) Remove the categories and replace them
                                    with the sequence_Id.
    @return networkDataPath  (str)  Path to data formatted for network API.
    """
    networkDataPath = self.networkDataGen.setupData(
      dataPath, self.numLabels, ordered, stripCats, **kwargs)

    return networkDataPath


  def initModel(self):
    """
    Initialize the network; self.networkDataPath must already be set.
    """
    recordStream = FileRecordStream(streamID=self.networkDataPath)
    root = os.path.dirname(os.path.realpath(__file__))
    encoder = CioEncoder(retinaScaling=self.retinaScaling,
                         cacheDir=os.path.join(root, "CioCache"),
                         retina=self.retina,
                         apiKey=self.apiKey)

    # This encoder specifies the LanguageSensor output width.
    return configureNetwork(recordStream, self.networkConfig, encoder)


  def _getLearningRegions(self):
    """Return a list of the network's region objects that learn."""
    learningRegions = []
    for region in self.network.regions.values():
      spec = region.getSpec()
      # A region "learns" iff its spec exposes a learningMode parameter.
      if spec.parameters.contains('learningMode'):
        learningRegions.append(region)

    return learningRegions


  # TODO: is this still needed?
  def encodeSample(self, sample):
    """
    Put each token in its own dictionary with its bitmap

    @param sample     (list)        Tokenized sample, where each item is a
                                    string token.
    @return           (list)        The sample text, sparsity, and bitmap for
                                    each token. Since the network will do the
                                    actual encoding, the bitmap and sparsity
                                    will be None
    Example return list:
      [{
        "text": "Example text",
        "sparsity": 0.0,
        "bitmap": None
      }]
    """
    return [{"text": t,
             "sparsity": None,
             "bitmap": None} for t in sample]


  def resetModel(self):
    """
    Reset the model by creating a new network since the network API does not
    support resets.
    """
    # TODO: test this works as expected
    self.network = self.initModel()


  def saveModel(self, trial=None):
    """
    Save the network to <modelDir>/network[_<trial>].nta.

    @param trial  (int)  Optional trial number appended to the file name.
    """
    try:
      if not os.path.exists(self.modelDir):
        os.makedirs(self.modelDir)
      if trial:
        netPath = os.path.join(self.modelDir, "network_{}.nta".format(trial))
      else:
        netPath = os.path.join(self.modelDir, "network.nta")
      # The Network API has its own serialization mechanism; the model object
      # itself is not pickled here.
      self.network.save(netPath)
      # with open(netPath, "wb") as f:
      #   pkl.dump(self, f)
      if self.verbosity > 0:
        print "Model saved to '{}'.".format(netPath)
    except IOError as e:
      print "Could not save model to '{}'.".format(netPath)
      raise e


  def trainModel(self, iterations=1):
    """
    Run the network with all regions learning.
    Note self.sampleReference doesn't get populated b/c in a network model
    there's a 1-to-1 mapping of training samples.

    @param iterations (int) Number of records to run through the network.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", True)

    self.network.run(iterations)


  def trainNetwork(self, iterations):
    """Run the network with all regions learning but the classifier."""
    for region in self.learningRegions:
      if region.name == "classifier":
        region.setParameter("learningMode", False)
      else:
        region.setParameter("learningMode", True)

    self.network.run(iterations)


  def classifyNetwork(self, iterations):
    """
    For running after the network has been trained by trainNetwork(), this
    populates the KNN prototype space with the final network representations.

    @return sequenceIds (list) Sequence ID emitted by the sensor on each
                               iteration.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", False)

    # Rewind the sensor so classification starts from the first record.
    sensor = self.sensorRegion.getSelf()
    sensor.rewind()

    # Only the classifier learns: it stores prototypes while inferring.
    self.classifierRegion.setParameter("learningMode", True)
    self.classifierRegion.setParameter("inferenceMode", True)

    sequenceIds = []
    for _ in xrange(iterations):
      self.network.run(1)
      sequenceIds.append(sensor.getOutputValues("sequenceIdOut")[0])

    return sequenceIds


  def inferNetwork(self, iterations, fileRecord=None, learn=False):
    """
    Run the network to infer distances to the classified samples.

    @param fileRecord (str)   If you want to change the file record stream.
    @param learn      (bool)  The classifier will learn the inferred sequence.
    @return           (numpy array) Summed classifier distances over all
                                    iterations (None if iterations == 0).
    """
    if fileRecord:
      self.swapRecordStream(fileRecord)
    self.classifierRegion.setParameter("learningMode", learn)
    self.classifierRegion.setParameter("inferenceMode", True)

    sampleDistances = None
    for i in xrange(iterations):
      self.network.run(1)
      inferenceValues = self.classifierRegion.getOutputData("categoriesOut")
      # Sum together the inferred distances for each word of the sequence.
      if sampleDistances is None:
        sampleDistances = inferenceValues
      else:
        sampleDistances += inferenceValues

    return sampleDistances


  def swapRecordStream(self, dataPath):
    """Change the data source for the network's sensor region."""
    recordStream = FileRecordStream(streamID=dataPath)
    sensor = self.sensorRegion.getSelf()
    sensor.dataSource = recordStream  # TODO: implement this in network API


  def testModel(self, seed=42):
    """
    Test the classifier region on the input sample. Call this method for each
    word of a sequence. The random seed is used in getWinningLabels().

    @return (numpy array) numLabels most-frequent classifications for the
                          data samples; int or empty.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", False)
    self.classifierRegion.setParameter("inferenceMode", True)

    self.network.run(1)

    inference = self._getClassifierInference(seed)
    activityBitmap = self.classifierRegion.getInputData("bottomUpIn")

    return inference, activityBitmap


  def _getClassifierInference(self, seed):
    """Return output categories from the classifier region."""
    relevantCats = self.classifierRegion.getParameter("categoryCount")

    if self.classifierRegion.type == "py.KNNClassifierRegion":
      # max number of inferences = k
      inferenceValues = self.classifierRegion.getOutputData(
        "categoriesOut")[:relevantCats]
      return self.getWinningLabels(inferenceValues, seed)

    elif self.classifierRegion.type == "py.CLAClassifierRegion":
      # TODO: test this
      return self.classifierRegion.getOutputData("categoriesOut")[:relevantCats]


  def queryModel(self, query, preprocess=False):
    """
    Run the query through the network, getting the classifier region's
    inferences for all words of the query sequence.

    @return       (list)          Two-tuples of sequence ID and distance,
                                  sorted closest to farthest from the query.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", False)
    self.classifierRegion.setParameter("inferenceMode", True)

    # Put query text in LanguageSensor data format.
    queryDicts = self.networkDataGen.generateSequence(query, preprocess)

    sensor = self.sensorRegion.getSelf()
    sampleDistances = None
    for qD in queryDicts:
      # Sum together the inferred distances for each word of the query
      # sequence.
      sensor.queue.appendleft(qD)
      self.network.run(1)
      inferenceValues = self.classifierRegion.getOutputData("categoriesOut")
      if sampleDistances is None:
        sampleDistances = inferenceValues
      else:
        sampleDistances += inferenceValues

    catCount = self.classifierRegion.getParameter("categoryCount")
    # numpy.lexsort() sorts primarily by its *last* key -- the summed
    # distances -- and uses randomValues as the secondary key, so ties in
    # distance are broken in a random manner.
    randomValues = numpy.random.random(catCount)
    sortedSamples = numpy.lexsort((randomValues, sampleDistances[:catCount]))
    qTuple = [(a, b) for a, b in zip(sortedSamples, sampleDistances[:catCount])]

    return sorted(qTuple, key=operator.itemgetter(1))
  def setupNetData(
      self, generateData=True, seed=42, preprocess=False, **kwargs):
    """
    Generate the network data files for buckets and training reps.

    Resulting network data files created:
      - One for each bucket
      - One for each training rep, where samples are not repeated in a given
      file. Each samples is given its own category (_category = _sequenceId).

    The classification json is saved when generating the final training file.

    @param generateData (bool)  Must be True; raises NotImplementedError
                                otherwise.
    @param seed         (int)   Seed for randomizing sample order when
                                self.orderedSplit is False.
    @param preprocess   (bool)  Forwarded to NetworkDataGenerator.split() as
                                textPreprocess.
    """
    if generateData:
      ndg = NetworkDataGenerator()
      self.dataDict = ndg.split(
        filePath=self.dataPath, numLabels=1, textPreprocess=preprocess,
        **kwargs)

      filename, ext = os.path.splitext(self.dataPath)
      self.classificationFile = "{}_categories.json".format(filename)

      # Generate test data files: one network data file for each bucket.
      bucketFilePaths = bucketCSVs(self.dataPath)
      for bucketFile in bucketFilePaths:
        # Reset the generator between buckets so each file only contains its
        # own bucket's sequences.
        ndg.reset()
        ndg.split(
          filePath=bucketFile, numLabels=1, textPreprocess=preprocess,
          **kwargs)
        bucketFileName, ext = os.path.splitext(bucketFile)
        if not self.orderedSplit:
          # the sequences will be written to the file in random order
          ndg.randomizeData(seed)
        dataFile = "{}_network{}".format(bucketFileName, ext)
        # the classification file here gets (correctly) overwritten later
        ndg.saveData(dataFile, self.classificationFile)
        self.bucketFiles.append(dataFile)

      # Generate training data file(s): deduplicate samples that appear in
      # multiple buckets, keyed by the unique ID at dataEntry[2].
      self.trainingDicts = []
      uniqueDataDict = OrderedDict()
      included = []
      seqID = 0
      for dataEntry in self.dataDict.values():
        uniqueID = dataEntry[2]
        if uniqueID not in included:
          # skip over the samples that are repeated in multiple buckets
          uniqueDataDict[seqID] = dataEntry
          included.append(uniqueID)
          seqID += 1
      self.trainingDicts.append(uniqueDataDict)

      ndg.reset()
      ndg.split(
        dataDict=uniqueDataDict, numLabels=1, textPreprocess=preprocess,
        **kwargs)
      for rep in xrange(self.trainingReps):
        # use a different file for each training rep
        if not self.orderedSplit:
          ndg.randomizeData(seed)
        ndg.stripCategories()  # replace the categories w/ seqId
        dataFile = "{}_network_training_{}{}".format(filename, rep, ext)
        ndg.saveData(dataFile, self.classificationFile)
        self.dataFiles.append(dataFile)
      # TODO: maybe add a method (and arg) for removing all these data files

    else:
      # TODO (only if needed)
      raise NotImplementedError("Must generate data.")

    # labels references match the classification json
    self.mapLabelRefs()
class ClassificationModelHTM(ClassificationModel): """Classify text using generic network-API based models.""" def __init__(self, networkConfig, inputFilePath, retinaScaling=1.0, retina="en_associative", apiKey=None, verbosity=1, numLabels=3, modelDir="ClassificationModelHTM", prepData=True, stripCats=False): """ @param networkConfig (dict) Network configuration dict with region parameters. @param inputFilePath (str) Path to data file. @param retinaScaling (float) Scales the dimensions of the SDRs. @param retina (str) Name of Cio retina. @param apiKey (str) Key for Cio API. @param prepData (bool) Prepare the input data into network API format. @param stripCats (bool) Remove the categories and replace them with the sequence_Id. See ClassificationModel for remaining parameters. Note classifierMetric is not specified here as it is in other models. This is done in the network config file. """ super(ClassificationModelHTM, self).__init__(verbosity=verbosity, numLabels=numLabels, modelDir=modelDir) self.networkConfig = networkConfig self.retinaScaling = retinaScaling self.retina = retina self.apiKey = apiKey self.inputFilePath = inputFilePath self.networkDataGen = NetworkDataGenerator() if prepData: self.networkDataPath = self.prepData(self.inputFilePath, stripCats=stripCats) else: self.networkDataPath = self.inputFilePath self.network = self.initModel() self._initializeRegionHelpers() def getClassifier(self): """ Returns the classifier for the model. """ return self.classifierRegion.getSelf().getAlgorithmInstance() def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs): """ Generate the data in network API format. @param dataPath (str) Path to input data file; format as expected by NetworkDataGenerator. @param ordered (bool) Keep order of data, or randomize. @param stripCats (bool) Remove the categories and replace them with the sequence_Id. @return networkDataPath (str) Path to data formtted for network API. 
""" networkDataPath = self.networkDataGen.setupData( dataPath, self.numLabels, ordered, stripCats, **kwargs) return networkDataPath def initModel(self): """ Initialize the network; self.networdDataPath must already be set. """ if self.networkDataPath is not None: recordStream = FileRecordStream(streamID=self.networkDataPath) else: recordStream = None root = os.path.dirname(os.path.realpath(__file__)) encoder = CioEncoder(retinaScaling=self.retinaScaling, cacheDir=os.path.join(root, "CioCache"), retina=self.retina, apiKey=self.apiKey) # This encoder specifies the LanguageSensor output width. return configureNetwork(recordStream, self.networkConfig, encoder) def _initializeRegionHelpers(self): """ Set helper member variables once network has been initialized. This will also be called from _deSerializeExtraData() """ learningRegions = [] for region in self.network.regions.values(): spec = region.getSpec() if spec.parameters.contains('learningMode'): learningRegions.append(region) # Always a sensor and classifier region. self.sensorRegion = self.network.regions[ self.networkConfig["sensorRegionConfig"].get("regionName")] self.classifierRegion = self.network.regions[ self.networkConfig["classifierRegionConfig"].get("regionName")] # There is sometimes a TP region self.tpRegion = None if self.networkConfig.has_key("tpRegionConfig"): self.tpRegion = self.network.regions[ self.networkConfig["tpRegionConfig"].get("regionName")] self.learningRegions = learningRegions # TODO: is this still needed? def encodeSample(self, sample): """ Put each token in its own dictionary with its bitmap @param sample (list) Tokenized sample, where each item is a string token. @return (list) The sample text, sparsity, and bitmap for each token. 
Since the network will do the actual encoding, the bitmap and sparsity will be None Example return list: [{ "text": "Example text", "sparsity": 0.0, "bitmap": None }] """ return [{"text": t, "sparsity": None, "bitmap": None} for t in sample] def resetModel(self): """ Reset the model by creating a new network since the network API does not support resets. """ # TODO: test this works as expected self.network = self.initModel() def saveModel(self, trial=None): try: if not os.path.exists(self.modelDir): os.makedirs(self.modelDir) if trial: netPath = os.path.join(self.modelDir, "network_{}.nta".format(trial)) else: netPath = os.path.join(self.modelDir, "network.nta") self.network.save(netPath) if self.verbosity > 0: print "Model saved to '{}'.".format(netPath) except IOError as e: print "Could not save model to '{}'.".format(netPath) raise e def trainModel(self, iterations=1): """ Run the network with all regions learning. Note self.sampleReference doesn't get populated b/c in a network model there's a 1-to-1 mapping of training samples. """ for region in self.learningRegions: region.setParameter("learningMode", True) self.network.run(iterations) def trainNetwork(self, iterations): """Run the network with all regions learning but the classifier.""" for region in self.learningRegions: if region.name == "classifier": region.setParameter("learningMode", False) else: region.setParameter("learningMode", True) self.network.run(iterations) def classifyNetwork(self, iterations): """ For running after the network has been trained by trainNetwork(), this populates the KNN prototype space with the final network representations. 
""" for region in self.learningRegions: region.setParameter("learningMode", False) sensor = self.sensorRegion.getSelf() sensor.rewind() self.classifierRegion.setParameter("learningMode", True) self.classifierRegion.setParameter("inferenceMode", True) sequenceIds = [] for _ in xrange(iterations): self.network.run(1) sequenceIds.append(sensor.getOutputValues("sequenceIdOut")[0]) return sequenceIds def inferNetwork(self, iterations, fileRecord=None, learn=False): """ Run the network to infer distances to the classified samples. @param fileRecord (str) If you want to change the file record stream. @param learn (bool) The classifier will learn the inferred sequnce. """ if fileRecord: self.swapRecordStream(fileRecord) self.classifierRegion.setParameter("learningMode", learn) self.classifierRegion.setParameter("inferenceMode", True) sampleDistances = None for i in xrange(iterations): self.network.run(1) inferenceValues = self.classifierRegion.getOutputData( "categoriesOut") # Sum together the inferred distances for each word of the sequence. if sampleDistances is None: sampleDistances = inferenceValues else: sampleDistances += inferenceValues return sampleDistances def swapRecordStream(self, dataPath): """Change the data source for the network's sensor region.""" recordStream = FileRecordStream(streamID=dataPath) sensor = self.sensorRegion.getSelf() sensor.dataSource = recordStream # TODO: implement this in network API def testModel(self, seed=42): """ Test the classifier region on the input sample. Call this method for each word of a sequence. The random seed is used in getWinningLabels(). @return (numpy array) numLabels most-frequent classifications for the data samples; int or empty. 
""" for region in self.learningRegions: region.setParameter("learningMode", False) self.classifierRegion.setParameter("inferenceMode", True) self.network.run(1) inference = self._getClassifierInference(seed) activityBitmap = self.classifierRegion.getInputData("bottomUpIn") return inference, activityBitmap def _getClassifierInference(self, seed): """Return output categories from the classifier region.""" relevantCats = self.classifierRegion.getParameter("categoryCount") if self.classifierRegion.type == "py.KNNClassifierRegion": # max number of inferences = k inferenceValues = self.classifierRegion.getOutputData( "categoriesOut")[:relevantCats] return self.getWinningLabels(inferenceValues, seed) elif self.classifierRegion.type == "py.CLAClassifierRegion": # TODO: test this return self.classifierRegion.getOutputData( "categoriesOut")[:relevantCats] def queryModel(self, query, preprocess=False): """ Run the query through the network, getting the classifer region's inferences for all words of the query sequence. @return (list) Two-tuples of sequence ID and distance, sorted closest to farthest from the query. """ for region in self.learningRegions: region.setParameter("learningMode", False) self.classifierRegion.setParameter("inferenceMode", True) # Put query text in LanguageSensor data format. queryDicts = self.networkDataGen.generateSequence(query, preprocess) sensor = self.sensorRegion.getSelf() sampleDistances = None for qD in queryDicts: # Sum together the inferred distances for each word of the query sequence. sensor.queue.appendleft(qD) self.network.run(1) inferenceValues = self.classifierRegion.getOutputData( "categoriesOut") if sampleDistances is None: sampleDistances = inferenceValues else: sampleDistances += inferenceValues catCount = self.classifierRegion.getParameter("categoryCount") # The use of numpy.lexsort() here is to first sort by randomValues, then # sort by random values; this breaks ties in a random manner. 
randomValues = numpy.random.random(catCount) sortedSamples = numpy.lexsort( (randomValues, sampleDistances[:catCount])) qTuple = [(a, b) for a, b in zip(sortedSamples, sampleDistances[:catCount])] return sorted(qTuple, key=operator.itemgetter(1)) def reset(self): """ Issue a reset signal to the model. The assumption is that a sequence has just ended and a new sequence is about to begin. The default behavior is to do nothing - not all subclasses may re-implement this. """ # TODO: Introduce a consistent reset method name. for r in self.learningRegions: if r.type == 'py.TemporalPoolerRegion': r.executeCommand(['reset']) elif r.type == 'py.TPRegion': r.executeCommand(['resetSequenceStates']) def trainText(self, token, labels, sequenceId=None, reset=0): """ Train the model with the given text token, associated labels, and sequence ID. @param token (str) The text token to train on @param labels (list) A list of one or more integer labels associated with this token. If the list is empty, the classifier will not be trained. @param sequenceId (int) An integer ID associated with this token and its sequence (document). @param reset (int) Should be 0 or 1. If 1, assumes we are at the beginning of a new sequence. """ for region in self.learningRegions: region.setParameter("learningMode", True) sensor = self.sensorRegion.getSelf() sensor.addDataToQueue(token, labels, sequenceId, 0) self.network.run(1) # Print the outputs of each region if self.verbosity >= 2: self.printRegionOutputs() if reset == 1: self.reset() def classifyText(self, token, reset=0): """ Classify the token and return a list of the best classifications. @param token (str) The text token to train on @param reset (int) Should be 0 or 1. If 1, assumes we are at the end of a sequence. A reset signal will be issued after the model has been trained on this token. @return (numpy array) An array of size numLabels. Position i contains the likelihood that this sample belongs to the i'th category. 
An array containing all zeros implies no decision could be made. """ for region in self.learningRegions: region.setParameter("learningMode", False) region.setParameter("inferenceMode", True) sensor = self.sensorRegion.getSelf() sensor.addDataToQueue(token, [None], -1, 0) self.network.run(1) # Print the outputs of each region if self.verbosity >= 2: self.printRegionOutputs() if reset == 1: self.reset() return self.classifierRegion.getOutputData( "categoriesOut")[0:self.numLabels] def printRegionOutputs(self): """ Print the outputs of regions to console for debugging, depending on verbosity level. """ print "================== HTM Debugging output:" print "Sensor output:", print self.sensorRegion.getOutputData("dataOut").nonzero() print "Sensor categoryOut:", print self.sensorRegion.getOutputData("categoryOut") if self.verbosity >= 3: if self.tpRegion is not None: print "TP region input:", print self.tpRegion.getInputData("activeCells").nonzero() print "TP region output:", print self.tpRegion.getOutputData("mostActiveCells").nonzero() print "Classifier bottomUpIn: ", print self.classifierRegion.getInputData("bottomUpIn").nonzero() print "Classifier categoryIn: ", print self.classifierRegion.getInputData( "categoryIn")[0:self.numLabels] print "Classifier categoriesOut: ", print self.classifierRegion.getOutputData( "categoriesOut")[0:self.numLabels] print "Classifier categoryProbabilitiesOut", print self.classifierRegion.getOutputData( "categoryProbabilitiesOut")[0:self.numLabels] def __getstate__(self): """ Return serializable state. This function will return a version of the __dict__ with data that shouldn't be pickled stripped out. For example, Network API instances are stripped out because they have their own serialization mechanism. 
See also: _serializeExtraData() """ state = self.__dict__.copy() # Remove member variables that we can't pickle state.pop("network") state.pop("sensorRegion") state.pop("classifierRegion") state.pop("tpRegion") state.pop("learningRegions") state.pop("networkDataGen") return state def _serializeExtraData(self, extraDataDir): """ Protected method that is called during serialization with an external directory path. We override it here to save the Network API instance. @param extraDataDir (string) Model's extra data directory path """ self.network.save(os.path.join(extraDataDir, "network.nta")) def _deSerializeExtraData(self, extraDataDir): """ Protected method that is called during deserialization (after __setstate__) with an external directory path. We override it here to load the Network API instance. @param extraDataDir (string) Model's extra data directory path """ self.network = Network(os.path.join(extraDataDir, "network.nta")) self._initializeRegionHelpers() self.networkDataGen = NetworkDataGenerator()