Example #1
    def testSaveData(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(self.dirName, "test_data/multi_sample_categories.json")
        success = ndg.saveData(dataOutputFile, categoriesOutputFile)
        self.assertTrue(success)

        dataTable = pandas.read_csv(dataOutputFile).fillna("")

        types = {"_category": "list", "_token": "string", "_sequenceId": "int", "_reset": "int", "ID": "string"}
        specials = {"_category": "C", "_token": "", "_sequenceId": "S", "_reset": "R", "ID": ""}

        expected_records = [record for data in self.expected for record in data]
        expected_records.insert(0, specials)
        expected_records.insert(0, types)

        for idx, values in dataTable.iterrows():
            record = values.to_dict()
            if idx > 1:
                # csv values are strings, so cast the ints
                record["_sequenceId"] = int(record["_sequenceId"])
                record["_reset"] = int(record["_reset"])
            self.assertDictEqual(record, expected_records[idx])

        with open(categoriesOutputFile) as f:
            categories = json.load(f)

        expected_categories = {"kitchen": 0, "environment": 1, "not helpful": 2}
        self.assertDictEqual(categories, expected_categories)
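
For reference, the assertions above rely on the FileRecordStream CSV convention: the first row holds the field names, the second the field types, and the third the "special" flags (S marks the sequence field, R the reset field, C the category field), which is why the test only casts ints for idx > 1. A sketch of the head of the saved file, assuming this column order (the order itself is illustrative):

    _token,_category,_sequenceId,_reset,ID
    string,list,int,int,string
    ,C,S,R,
    gohbkchoo,0 1,0,1,1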
Example #2
    def testSplitPreprocess(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")

        expected = [
            [{"_token": "gohbkchoo", "_category": "0 1", "_sequenceId": 0, "ID": "1", "_reset": 1}],
            [
                {"_token": "o", "_category": "2", "_sequenceId": 1, "ID": "2", "_reset": 1},
                {"_token": "ca", "_category": "2", "_sequenceId": 1, "ID": "2", "_reset": 0},
            ],
        ]

        ndg.split(filename, 3, True, ignoreCommon=100, correctSpell=True)
        self.assertRecordsEqual(ndg.records, expected)
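
Based on the keyword call to split() in generateNetworkDataFiles below (Example #5), the positional arguments here presumably map to filePath, numLabels, and textPreprocess, making the call above equivalent to this sketch:

    ndg.split(filePath=filename, numLabels=3, textPreprocess=True,
              ignoreCommon=100, correctSpell=True)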
Example #3
    def testSaveDataIncorrectType(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        dataOutputFile = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(self.dirName, "test_data/multi_sample_categories.csv")
        ndg.split(filename, 3, False)

        with self.assertRaises(TypeError):
            ndg.saveData(dataOutputFile, categoriesOutputFile)
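
The test expects saveData() to raise TypeError when the categories file is not JSON. A plausible guard consistent with that behavior (hypothetical; the real saveData() may validate differently):

    import os

    def _checkCategoriesPath(categoriesOutputFile):
        # Hypothetical guard; shown only to illustrate what the test asserts.
        if os.path.splitext(categoriesOutputFile)[1] != ".json":
            raise TypeError("categoriesOutputFile must be a JSON file")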
Example #4
    def testFileRecordStreamReadData(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(self.dirName, "test_data/multi_sample_categories.json")
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        # If no error is raised, then the data is in the correct format
        frs = FileRecordStream(dataOutputFile)
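
Beyond merely constructing the stream, the records can be pulled back out; a short sketch using standard FileRecordStream calls (assumed available here):

    frs = FileRecordStream(dataOutputFile)   # as in the test above
    print frs.getFieldNames()                # field names from the CSV header row
    print frs.getNextRecord()                # first data record as a list of values
    frs.close()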
Example #5
    def generateNetworkDataFiles(self, splits, seed, preprocess, **kwargs):
        # TODO: use model.prepData()?
        ndg = NetworkDataGenerator()
        self.dataDict = ndg.split(filePath=self.dataPath,
                                  numLabels=self.numClasses,
                                  textPreprocess=preprocess,
                                  **kwargs)

        filename, ext = os.path.splitext(self.dataPath)
        self.classificationFile = "{}_categories.json".format(filename)

        # Generate one data file for each experiment iteration.
        if self.experimentType == "k-folds" and not self.orderedSplit:
            # only randomize the data order once for k-folds cross validation
            ndg.randomizeData(seed)
        for i in xrange(splits):
            if self.experimentType != "k-folds" and not self.orderedSplit:
                ndg.randomizeData(seed)
                seed += 1
            # ext='.csv'
            dataFile = "{}_network_{}{}".format(filename, i, ext)
            ndg.saveData(dataFile, self.classificationFile)
            self.dataFiles.append(dataFile)

        if self.verbosity > 0:
            print "{} file(s) generated at {}".format(len(self.dataFiles),
                                                      self.dataFiles)
            print "Classification JSON is at: {}".format(
                self.classificationFile)
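
The shuffling policy above: k-folds cross validation randomizes the data order once so every fold file shares the same ordering, while other experiment types reshuffle (with an incremented seed) for each split. A self-contained sketch of the same policy, ignoring the orderedSplit flag for brevity (shuffled_splits is a hypothetical helper):

    import random

    def shuffled_splits(records, splits, seed, kFolds):
        """Yield one record ordering per split, mirroring the logic above."""
        if kFolds:
            random.Random(seed).shuffle(records)      # shuffle once, reuse for all folds
        for _ in xrange(splits):
            if not kFolds:
                random.Random(seed).shuffle(records)  # fresh shuffle per split
                seed += 1
            yield list(records)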
Example #6
    def _deSerializeExtraData(self, extraDataDir):
        """
    Protected method that is called during deserialization (after __setstate__)
    with an external directory path. We override it here to load the Network API
    instance.

    @param extraDataDir (string) Model's extra data directory path
    """
        self.network = Network(os.path.join(extraDataDir, "network.nta"))
        self._initializeRegionHelpers()
        self.networkDataGen = NetworkDataGenerator()
Example #7
    def testSaveData(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        success = ndg.saveData(dataOutputFile, categoriesOutputFile)
        self.assertTrue(success)

        dataTable = pandas.read_csv(dataOutputFile).fillna("")

        types = {
            "_category": "list",
            "_token": "string",
            "_sequenceId": "int",
            "_reset": "int",
            "ID": "string"
        }
        specials = {
            "_category": "C",
            "_token": "",
            "_sequenceId": "S",
            "_reset": "R",
            "ID": ""
        }

        expected_records = [
            record for data in self.expected for record in data
        ]
        expected_records.insert(0, specials)
        expected_records.insert(0, types)

        for idx, values in dataTable.iterrows():
            record = values.to_dict()
            if idx > 1:
                # csv values are strings, so cast the ints
                record["_sequenceId"] = int(record["_sequenceId"])
                record["_reset"] = int(record["_reset"])
            self.assertDictEqual(record, expected_records[idx])

        with open(categoriesOutputFile) as f:
            categories = json.load(f)

        expected_categories = {
            "kitchen": 0,
            "environment": 1,
            "not helpful": 2
        }
        self.assertDictEqual(categories, expected_categories)
Example #8
  def generateNetworkDataFiles(self, splits, seed, preprocess, **kwargs):
    # TODO: use model.prepData()?
    ndg = NetworkDataGenerator()
    self.dataDict = ndg.split(
      filePath=self.dataPath, numLabels=self.numClasses, textPreprocess=preprocess, **kwargs)

    filename, ext = os.path.splitext(self.dataPath)
    self.classificationFile = "{}_categories.json".format(filename)

    # Generate one data file for each experiment iteration.
    if self.experimentType == "k-folds" and not self.orderedSplit:
      # only randomize the data order once for k-folds cross validation
      ndg.randomizeData(seed)
    for i in xrange(splits):
      if self.experimentType != "k-folds" and not self.orderedSplit:
        ndg.randomizeData(seed)
        seed += 1
      # ext='.csv'
      dataFile = "{}_network_{}{}".format(filename, i, ext)
      ndg.saveData(dataFile, self.classificationFile)
      self.dataFiles.append(dataFile)

    if self.verbosity > 0:
      print "{} file(s) generated at {}".format(len(self.dataFiles),
        self.dataFiles)
      print "Classification JSON is at: {}".format(self.classificationFile)
Example #9
    def testSplitPreprocess(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")

        expected = [
            [{"_token": "gohbkchoo", "_category": "0 1", "_sequenceId": 0, "ID": "1", "_reset": 1}],
            [
                {"_token": "o", "_category": "2", "_sequenceId": 1, "ID": "2", "_reset": 1},
                {"_token": "ca", "_category": "2", "_sequenceId": 1, "ID": "2", "_reset": 0},
            ],
        ]

        ndg.split(filename, 3, True, ignoreCommon=100, correctSpell=True)
        self.assertRecordsEqual(ndg.records, expected)
Example #10
    def train(self):
        """
    Train the network regions on the entire dataset.
    There should be one datafile for each training rep in self.dataFiles, where
    every data sample (i.e. sequence) appears only once in each file.
    """
        # TODO: ignore patterns < minSparsity (= 0.9 * unionSparsity)
        if self.trainingReps != len(self.dataFiles):
            raise RuntimeError(
                "Mismatch between the number of specified training "
                "reps and the number of data files (should be 1:1).")

        for dataFile in self.dataFiles:
            if self.verbosity > 0:
                print "Running all the data through the network for training..."
            self.model.swapRecordStream(dataFile)
            numTokens = NetworkDataGenerator().getNumberOfTokens(dataFile)
            n = sum(numTokens)
            self.model.trainNetwork(n)

        # Populate the classifier space by running through the current data file;
        # learning (in other regions) is turned off by the model.
        if self.verbosity > 1:
            print "Populating the classifier with all of the sequences."
        self.classifiedSeqIds = self.model.classifyNetwork(n)
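
getNumberOfTokens() evidently returns per-sequence token counts for a data file; summing them gives the number of iterations needed to feed every token through the network once (note classifyNetwork() reuses the n from the last file in the loop). A toy illustration with made-up counts:

    numTokens = [3, 1, 2]   # hypothetical per-sequence token counts for one file
    n = sum(numTokens)      # 6 network iterations consume every token once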
Example #11
    def __init__(self,
                 networkConfig,
                 inputFilePath,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelHTM",
                 prepData=True,
                 stripCats=False):
        """
    @param networkConfig      (dict)    Network configuration dict with region
                                        parameters.
    @param inputFilePath      (str)     Path to data file.
    @param retinaScaling      (float)   Scales the dimensions of the SDRs.
    @param retina             (str)     Name of Cio retina.
    @param apiKey             (str)     Key for Cio API.
    @param prepData           (bool)    Prepare the input data into network API
                                        format.
    @param stripCats          (bool)    Remove the categories and replace them
                                        with the sequence_Id.
    See ClassificationModel for remaining parameters.

    Note classifierMetric is not specified here as it is in other models. This
    is done in the network config file.
    """
        super(ClassificationModelHTM, self).__init__(verbosity=verbosity,
                                                     numLabels=numLabels,
                                                     modelDir=modelDir)

        self.networkConfig = networkConfig
        self.retinaScaling = retinaScaling
        self.retina = retina
        self.apiKey = apiKey
        self.inputFilePath = inputFilePath

        self.networkDataGen = NetworkDataGenerator()
        if prepData:
            self.networkDataPath = self.prepData(self.inputFilePath,
                                                 stripCats=stripCats)
        else:
            self.networkDataPath = self.inputFilePath

        self.network = self.initModel()
        self._initializeRegionHelpers()
Example #12
  def _getClassifications(self, iteration):
    """
    Get the classifications for a particular iteration.

    @param iteration  (int)       Iteration of the experiment.
    @return           (list)      List of lists of classification ids, one list
                                  per sample.
    """
    dataFile = self.dataFiles[iteration]
    classifications = NetworkDataGenerator.getClassifications(dataFile)
    return [[int(c) for c in classes.strip().split(" ")]
            for classes in classifications]
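
getClassifications() apparently yields one space-separated label string per sample, which the comprehension above parses into integer lists; a quick standalone check:

    classifications = ["0 1", "2"]   # one space-separated label string per sample
    parsed = [[int(c) for c in s.strip().split(" ")] for s in classifications]
    assert parsed == [[0, 1], [2]]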
Example #13
  def partitionIndices(self, seed=42, numInference=10):
    """
    Sets self.partitions for the buckets' querying and ranking sets. The
    corresponding numbers of tokens for each sequence are stored in
    self.numTokens.

    The order of sequences is already specified by the network data files; if
    generated by the experiment, these are in order or randomized as specified
    by the orderedSplit arg.
    """
    super(BucketHTMRunner, self).partitionIndices(
      seed=seed, numInference=numInference)

    # Get the number of tokens in each bucket file so the network knows how many
    # iterations to run. The order of buckets in self.bucketFiles is not
    # necessarily the same
    ndg = NetworkDataGenerator()
    for dataFile in self.bucketFiles:
      self.numTokens.append(ndg.getNumberOfTokens(dataFile))
Example #14
  def partitionIndices(self, _):
    """
    Sets self.partitions for the number of tokens for each sample in the
    training and test sets.

    The order of sequences is already specified by the network data files; if
    generated by the experiment, these are in order or randomized as specified
    by the orderedSplit arg.
    """
    if self.experimentType == "k-folds":
      for fold in xrange(self.folds):
        dataFile = self.dataFiles[fold]
        numTokens = NetworkDataGenerator.getNumberOfTokens(dataFile)
        self.partitions = KFolds(self.folds).split(numTokens, randomize=False)
    else:
      for trial, split in enumerate(self.trainSizes):
        dataFile = self.dataFiles[trial]
        numTokens = NetworkDataGenerator.getNumberOfTokens(dataFile)
        self.partitions.append((numTokens[:split], numTokens[split:]))
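
In the non-k-folds branch above, each trial's partition is simply a prefix/suffix split of the per-sequence token counts. A toy example with made-up counts:

    numTokens = [3, 1, 2, 4]   # hypothetical per-sequence token counts
    split = 2                  # train on the first two sequences
    partition = (numTokens[:split], numTokens[split:])   # ([3, 1], [2, 4])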
Example #15
    def partitionIndices(self, _):
        """
    Sets self.partitions for the number of tokens for each sample in the
    training and test sets.

    The order of sequences is already specified by the network data files; if
    generated by the experiment, these are in order or randomized as specified
    by the orderedSplit arg.
    """
        if self.experimentType == "k-folds":
            for fold in xrange(self.folds):
                dataFile = self.dataFiles[fold]
                numTokens = NetworkDataGenerator.getNumberOfTokens(dataFile)
                self.partitions = KFolds(self.folds).split(numTokens,
                                                           randomize=False)
        else:
            for trial, split in enumerate(self.trainSizes):
                dataFile = self.dataFiles[trial]
                numTokens = NetworkDataGenerator.getNumberOfTokens(dataFile)
                self.partitions.append((numTokens[:split], numTokens[split:]))
Example #16
  def _getClassifications(self, iteration):
    """
    Get the classifications for a particular iteration.

    @param iteration  (int)       Iteration of the experiment.
    @return           (list)      List of lists of classification ids, one list
                                  per sample.
    """
    dataFile = self.dataFiles[iteration]
    classifications = NetworkDataGenerator.getClassifications(dataFile)
    return [[int(c) for c in classes.strip().split(" ")]
            for classes in classifications]
Example #17
    def _deSerializeExtraData(self, extraDataDir):
        """
    Protected method that is called during deserialization (after __setstate__)
    with an external directory path. We override it here to load the Network API
    instance.

    @param extraDataDir (string) Model's extra data directory path
    """
        self.network = Network(os.path.join(extraDataDir, "network.nta"))
        self._initializeRegionHelpers()
        self.networkDataGen = NetworkDataGenerator()
Example #18
    def __init__(
        self,
        networkConfig,
        inputFilePath,
        retinaScaling=1.0,
        retina="en_associative",
        apiKey=None,
        verbosity=1,
        numLabels=3,
        modelDir="ClassificationModelHTM",
        prepData=True,
        stripCats=False,
        cacheRoot=None,
    ):
        """
    @param networkConfig      (dict)    Network configuration dict with region
                                        parameters.
    @param inputFilePath      (str)     Path to data file.
    @param retinaScaling      (float)   Scales the dimensions of the SDRs.
    @param retina             (str)     Name of Cio retina.
    @param apiKey             (str)     Key for Cio API.
    @param prepData           (bool)    Prepare the input data into network API
                                        format.
    @param stripCats          (bool)    Remove the categories and replace them
                                        with the sequence_Id.
    @param cacheRoot          (str)     Root cache directory for CioEncoder
    See ClassificationModel for remaining parameters.

    Note classifierMetric is not specified here as it is in other models. This
    is done in the network config file.
    """
        super(ClassificationModelHTM, self).__init__(verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        self.networkConfig = networkConfig
        self.retinaScaling = retinaScaling
        self.retina = retina
        self.apiKey = apiKey
        self.inputFilePath = inputFilePath

        self.networkDataGen = NetworkDataGenerator()
        if prepData:
            self.networkDataPath = self.prepData(self.inputFilePath, stripCats=stripCats)
        else:
            self.networkDataPath = self.inputFilePath

        self.cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

        self.network = self.initModel()
        self._initializeRegionHelpers()
Example #19
    def testSaveDataIncorrectType(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.csv")
        ndg.split(filename, 3, False)

        with self.assertRaises(TypeError):
            ndg.saveData(dataOutputFile, categoriesOutputFile)
Example #20
  def __init__(self,
               networkConfig,
               inputFilePath,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelHTM",
               prepData=True,
               stripCats=False):
    """
    @param networkConfig      (dict)    Network configuration dict with region
                                        parameters.
    @param inputFilePath      (str)     Path to data file.
    @param retinaScaling      (float)   Scales the dimensions of the SDRs.
    @param retina             (str)     Name of Cio retina.
    @param apiKey             (str)     Key for Cio API.
    @param prepData           (bool)    Prepare the input data into network API
                                        format.
    @param stripCats          (bool)    Remove the categories and replace them
                                        with the sequence_Id.
    See ClassificationModel for remaining parameters.
    """
    super(ClassificationModelHTM, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.networkConfig = networkConfig
    self.retinaScaling = retinaScaling
    self.retina = retina
    self.apiKey = apiKey

    self.networkDataGen = NetworkDataGenerator()
    if prepData:
      self.networkDataPath = self.prepData(inputFilePath, stripCats=stripCats)
    else:
      self.networkDataPath = inputFilePath

    self.network = self.initModel()
    self.learningRegions = self._getLearningRegions()

    # Always a sensor and classifier region.
    self.sensorRegion = self.network.regions[
      self.networkConfig["sensorRegionConfig"].get("regionName")]
    self.classifierRegion = self.network.regions[
      self.networkConfig["classifierRegionConfig"].get("regionName")]
Example #21
    def testFileRecordStreamReadData(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        # If no error is raised, then the data is in the correct format
        frs = FileRecordStream(dataOutputFile)
Example #22
    def testing(self, trial, seed):
        """
    Test the network on the test set for a particular trial and store the
    results.

    @param trial      (int)       trial count
    @param seed       (int)       random seed used to select the winning labels
    """
        if self.verbosity > 0:
            i = sum(self.partitions[trial][0])
            indices = []
            for numTokens in self.partitions[trial][1]:
                indices.append(i)
                i += numTokens
            print(
                "\tRunner selects to test on sequences starting at indices "
                "{}".format(indices))

        results = ([], [])
        testIndex = len(self.partitions[trial][0])
        for numTokens in self.partitions[trial][1]:
            predictions = []
            activations = []
            for _ in xrange(numTokens):
                predicted, active = self.model.testModel(seed)
                activations.append(active)
                predictions.append(predicted)
            winningPredictions = self._selectWinners(predictions, activations)

            # TODO: switch to standard (expected, actual) format
            results[0].append(winningPredictions)
            results[1].append(self.actualLabels[trial][testIndex])
            testIndex += 1

        # Prepare data for writeOutClassifications
        trainIdx = range(len(self.partitions[trial][0]))
        testIdx = range(
            len(self.partitions[trial][0]),
            len(self.partitions[trial][0]) + len(self.partitions[trial][1]))
        self.partitions[trial] = (trainIdx, testIdx)
        self.samples = NetworkDataGenerator.getSamples(self.dataFiles[trial])

        self.results.append(results)
Example #23
  def testing(self, trial, seed):
    """
    Test the network on the test set for a particular trial and store the
    results.

    @param trial      (int)       trial count
    @param seed       (int)       random seed used to select the winning labels
    """
    if self.verbosity > 0:
      i = sum(self.partitions[trial][0])
      indices = []
      for numTokens in self.partitions[trial][1]:
        indices.append(i)
        i += numTokens
      print ("\tRunner selects to test on sequences starting at indices "
             "{}".format(indices))

    results = ([], [])
    testIndex = len(self.partitions[trial][0])
    for numTokens in self.partitions[trial][1]:
      predictions = []
      activations = []
      for _ in xrange(numTokens):
        predicted, active = self.model.testModel(seed)
        activations.append(active)
        predictions.append(predicted)
      winningPredictions = self._selectWinners(predictions, activations)

      # TODO: switch to standard (expected, actual) format
      results[0].append(winningPredictions)
      results[1].append(self.actualLabels[trial][testIndex])
      testIndex += 1

    # Prepare data for writeOutClassifications
    trainIdx = range(len(self.partitions[trial][0]))
    testIdx = range(len(self.partitions[trial][0]),
      len(self.partitions[trial][0]) + len(self.partitions[trial][1]))
    self.partitions[trial] = (trainIdx, testIdx)
    self.samples = NetworkDataGenerator.getSamples(self.dataFiles[trial])

    self.results.append(results)
Example #24
    def testRandomize(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)

        random.seed(1)
        ndg.randomizeData()

        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        success = ndg.saveData(dataOutputFile, categoriesOutputFile)

        randomizedIDs = []
        dataTable = pandas.read_csv(dataOutputFile)
        for _, values in dataTable.iterrows():
            record = values.to_dict()
            idx = record["_sequenceId"]
            if idx.isdigit() and (not randomizedIDs
                                  or randomizedIDs[-1] != idx):
                randomizedIDs.append(idx)

        self.assertNotEqual(randomizedIDs, range(len(randomizedIDs)))
Example #25
    def testRandomize(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)

        random.seed(1)
        ndg.randomizeData()

        dataOutputFile = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(self.dirName, "test_data/multi_sample_categories.json")
        success = ndg.saveData(dataOutputFile, categoriesOutputFile)

        randomizedIDs = []
        dataTable = pandas.read_csv(dataOutputFile)
        for _, values in dataTable.iterrows():
            record = values.to_dict()
            idx = record["_sequenceId"]
            if idx.isdigit() and (not randomizedIDs or randomizedIDs[-1] != idx):
                randomizedIDs.append(idx)

        self.assertNotEqual(randomizedIDs, range(len(randomizedIDs)))
Example #26
    def testSplitNoPreprocess(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")

        ndg.split(filename, 3, False)
        self.assertRecordsEqual(ndg.records, self.expected)
Example #27
class ClassificationModelHTM(ClassificationModel):
    """Classify text using generic network-API based models."""

    def __init__(
        self,
        networkConfig,
        inputFilePath,
        retinaScaling=1.0,
        retina="en_associative",
        apiKey=None,
        verbosity=1,
        numLabels=3,
        modelDir="ClassificationModelHTM",
        prepData=True,
        stripCats=False,
        cacheRoot=None,
    ):
        """
    @param networkConfig      (dict)    Network configuration dict with region
                                        parameters.
    @param inputFilePath      (str)     Path to data file.
    @param retinaScaling      (float)   Scales the dimensions of the SDRs.
    @param retina             (str)     Name of Cio retina.
    @param apiKey             (str)     Key for Cio API.
    @param prepData           (bool)    Prepare the input data into network API
                                        format.
    @param stripCats          (bool)    Remove the categories and replace them
                                        with the sequence_Id.
    @param cacheRoot          (str)     Root cache directory for CioEncoder
    See ClassificationModel for remaining parameters.

    Note classifierMetric is not specified here as it is in other models. This
    is done in the network config file.
    """
        super(ClassificationModelHTM, self).__init__(verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

        self.networkConfig = networkConfig
        self.retinaScaling = retinaScaling
        self.retina = retina
        self.apiKey = apiKey
        self.inputFilePath = inputFilePath

        self.networkDataGen = NetworkDataGenerator()
        if prepData:
            self.networkDataPath = self.prepData(self.inputFilePath, stripCats=stripCats)
        else:
            self.networkDataPath = self.inputFilePath

        self.cacheRoot = cacheRoot or os.path.dirname(os.path.realpath(__file__))

        self.network = self.initModel()
        self._initializeRegionHelpers()

    def getClassifier(self):
        """
    Returns the classifier for the model.
    """
        return self.classifierRegion.getSelf().getAlgorithmInstance()

    def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
        """
    Generate the data in network API format.

    @param dataPath          (str)  Path to input data file; format as expected
                                    by NetworkDataGenerator.
    @param ordered           (bool) Keep order of data, or randomize.
    @param stripCats         (bool) Remove the categories and replace them with
                                    the sequence_Id.
    @return networkDataPath  (str)  Path to data formatted for network API.
    """
        networkDataPath = self.networkDataGen.setupData(dataPath, self.numLabels, ordered, stripCats, **kwargs)

        return networkDataPath

    def initModel(self):
        """
    Initialize the network; self.networkDataPath must already be set.
    """
        if self.networkDataPath is not None:
            recordStream = FileRecordStream(streamID=self.networkDataPath)
        else:
            recordStream = None

        encoder = CioEncoder(
            retinaScaling=self.retinaScaling,
            cacheDir=os.path.join(self.cacheRoot, "CioCache"),
            retina=self.retina,
            apiKey=self.apiKey,
            verbosity=self.verbosity - 1,
        )

        # This encoder specifies the LanguageSensor output width.
        return configureNetwork(recordStream, self.networkConfig, encoder)

    def _initializeRegionHelpers(self):
        """
    Set helper member variables once network has been initialized. This will
    also be called from _deSerializeExtraData()
    """
        learningRegions = []
        for region in self.network.regions.values():
            spec = region.getSpec()
            if spec.parameters.contains("learningMode"):
                learningRegions.append(region)

        # Always a sensor and classifier region.
        self.sensorRegion = self.network.regions[self.networkConfig["sensorRegionConfig"].get("regionName")]
        self.classifierRegion = self.network.regions[self.networkConfig["classifierRegionConfig"].get("regionName")]

        # There is sometimes a TP region
        self.tpRegion = None
        if "tpRegionConfig" in self.networkConfig:
            self.tpRegion = self.network.regions[
                self.networkConfig["tpRegionConfig"].get("regionName")]

        self.learningRegions = learningRegions

        self.network.enableProfiling()

    # TODO: is this still needed?
    def encodeSample(self, sample):
        """
    Put each token in its own dictionary with its bitmap
    @param sample     (list)            Tokenized sample, where each item is a
                                        string token.
    @return           (list)            The sample text, sparsity, and bitmap
                                        for each token. Since the network will
                                        do the actual encoding, the bitmap and
                                        sparsity will be None
    Example return list:
      [{
        "text": "Example text",
        "sparsity": 0.0,
        "bitmap": None
      }]
    """
        return [{"text": t, "sparsity": None, "bitmap": None} for t in sample]

    def resetModel(self):
        """
    Reset the model by creating a new network since the network API does not
    support resets.
    """
        # TODO: test this works as expected
        self.network = self.initModel()

    def saveModel(self, trial=None):
        try:
            if not os.path.exists(self.modelDir):
                os.makedirs(self.modelDir)
            if trial:
                netPath = os.path.join(self.modelDir, "network_{}.nta".format(trial))
            else:
                netPath = os.path.join(self.modelDir, "network.nta")
            self.network.save(netPath)
            if self.verbosity > 0:
                print "Model saved to '{}'.".format(netPath)
        except IOError as e:
            print "Could not save model to '{}'.".format(netPath)
            raise e

    def trainModel(self, iterations=1):
        """
    Run the network with all regions learning.
    Note self.sampleReference doesn't get populated because in a network model
    there's a 1-to-1 mapping of training samples.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", True)

        self.network.run(iterations)

    def trainNetwork(self, iterations):
        """Run the network with all regions learning but the classifier."""
        for region in self.learningRegions:
            if region.name == "classifier":
                region.setParameter("learningMode", False)
            else:
                region.setParameter("learningMode", True)

        self.network.run(iterations)

    def classifyNetwork(self, iterations):
        """
    For running after the network has been trained by trainNetwork(), this
    populates the KNN prototype space with the final network representations.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)

        sensor = self.sensorRegion.getSelf()
        sensor.rewind()

        self.classifierRegion.setParameter("learningMode", True)
        self.classifierRegion.setParameter("inferenceMode", True)

        sequenceIds = []
        for _ in xrange(iterations):
            self.network.run(1)
            sequenceIds.append(sensor.getOutputValues("sequenceIdOut")[0])

        return sequenceIds

    def inferNetwork(self, iterations, fileRecord=None, learn=False):
        """
    Run the network to infer distances to the classified samples.

    @param fileRecord (str)     If you want to change the file record stream.
    @param learn      (bool)    The classifier will learn the inferred sequence.
    """
        if fileRecord:
            self.swapRecordStream(fileRecord)

        self.classifierRegion.setParameter("learningMode", learn)
        self.classifierRegion.setParameter("inferenceMode", True)

        sampleDistances = None
        for i in xrange(iterations):
            self.network.run(1)
            inferenceValues = self.classifierRegion.getOutputData("categoriesOut")
            # Sum together the inferred distances for each word of the sequence.
            if sampleDistances is None:
                sampleDistances = inferenceValues
            else:
                sampleDistances += inferenceValues

        return sampleDistances

    def swapRecordStream(self, dataPath):
        """Change the data source for the network's sensor region."""
        recordStream = FileRecordStream(streamID=dataPath)
        sensor = self.sensorRegion.getSelf()
        sensor.dataSource = recordStream  # TODO: implement this in network API

    def testModel(self, seed=42):
        """
    Test the classifier region on the input sample. Call this method for each
    word of a sequence. The random seed is used in getWinningLabels().

    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
        self.classifierRegion.setParameter("inferenceMode", True)

        self.network.run(1)

        inference = self._getClassifierInference(seed)
        activityBitmap = self.classifierRegion.getInputData("bottomUpIn")

        return inference, activityBitmap

    def _getClassifierInference(self, seed):
        """Return output categories from the classifier region."""
        relevantCats = self.classifierRegion.getParameter("categoryCount")

        if self.classifierRegion.type == "py.KNNClassifierRegion":
            # max number of inferences = k
            inferenceValues = self.classifierRegion.getOutputData("categoriesOut")[:relevantCats]
            return self.getWinningLabels(inferenceValues, seed)

        elif self.classifierRegion.type == "py.CLAClassifierRegion":
            # TODO: test this
            return self.classifierRegion.getOutputData("categoriesOut")[:relevantCats]

    def queryModel(self, query, preprocess=False):
        """
    Run the query through the network, getting the classifier region's inferences
    for all words of the query sequence.
    @return       (list)          Two-tuples of sequence ID and distance, sorted
                                  closest to farthest from the query.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
        self.classifierRegion.setParameter("inferenceMode", True)

        # Put query text in LanguageSensor data format.
        queryDicts = self.networkDataGen.generateSequence(query, preprocess)

        sensor = self.sensorRegion.getSelf()
        sampleDistances = None
        for qD in queryDicts:
            # Sum together the inferred distances for each word of the query sequence.
            sensor.queue.appendleft(qD)
            self.network.run(1)
            inferenceValues = self.classifierRegion.getOutputData("categoriesOut")
            if sampleDistances is None:
                sampleDistances = inferenceValues
            else:
                sampleDistances += inferenceValues

        catCount = self.classifierRegion.getParameter("categoryCount")
        # numpy.lexsort() sorts primarily by its last key (sampleDistances) and
        # breaks ties with randomValues, so equidistant samples order randomly.
        randomValues = numpy.random.random(catCount)
        sortedSamples = numpy.lexsort((randomValues, sampleDistances[:catCount]))
        qTuple = [(a, b) for a, b in zip(sortedSamples, sampleDistances[:catCount])]

        return sorted(qTuple, key=operator.itemgetter(1))

    def tokenize(self, text, preprocess=False):
        """
    Given a bunch of text (could be several sentences) return a single list
    containing individual tokens. Text is tokenized using the CIO tokenize
    method.

    @param text         (str)     A bunch of text.
    @param preprocess   (bool)    Whether or not to preprocess the text data.
    """
        encoder = self.sensorRegion.getSelf().encoder
        sentenceList = encoder.client.tokenize(text)
        tokenList = []
        for sentence in sentenceList:
            tokenList.extend(sentence.split(","))
        return tokenList

    def reset(self):
        """
    Issue a reset signal to the model. The assumption is that a sequence has
    just ended and a new sequence is about to begin.  The default behavior is
    to do nothing - not all subclasses may re-implement this.
    """
        # TODO: Introduce a consistent reset method name.
        for r in self.learningRegions:
            if r.type == "py.TemporalPoolerRegion":
                r.executeCommand(["reset"])
            elif r.type == "py.TPRegion":
                r.executeCommand(["resetSequenceStates"])

    def trainText(self, token, labels, sequenceId=None, reset=0):
        """
    Train the model with the given text token, associated labels, and
    sequence ID.

    @param token      (str)  The text token to train on
    @param labels     (list) A list of one or more integer labels associated
                             with this token. If the list is empty, the
                             classifier will not be trained.
    @param sequenceId (int)  An integer ID associated with this token and its
                             sequence (document).
    @param reset      (int)  Should be 0 or 1. If 1, assumes we are at the
                             beginning of a new sequence.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", True)
        sensor = self.sensorRegion.getSelf()
        sensor.addDataToQueue(token, labels, sequenceId, 0)
        self.network.run(1)

        # Print the outputs of each region
        if self.verbosity >= 2:
            self.printRegionOutputs()

        if reset == 1:
            self.reset()

    def classifyText(self, token, reset=0):
        """
    Classify the token and return a list of the best classifications.

    @param token    (str)  The text token to train on
    @param reset    (int)  Should be 0 or 1. If 1, assumes we are at the
                           end of a sequence. A reset signal will be issued
                           after the model has been trained on this token.

    @return  (numpy array) An array of size numLabels. Position i contains
                           the likelihood that this sample belongs to the
                           i'th category. An array containing all zeros
                           implies no decision could be made.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
            region.setParameter("inferenceMode", True)
        sensor = self.sensorRegion.getSelf()
        sensor.addDataToQueue(token, [None], -1, 0)
        self.network.run(1)

        # Print the outputs of each region
        if self.verbosity >= 2:
            self.printRegionOutputs()

        if reset == 1:
            self.reset()

        return self.classifierRegion.getOutputData("categoriesOut")[0 : self.numLabels]

    def printRegionOutputs(self):
        """
    Print the outputs of regions to console for debugging, depending on
    verbosity level.
    """

        print "================== HTM Debugging output:"
        print "Sensor output:",
        print self.sensorRegion.getOutputData("dataOut").nonzero()
        print "Sensor categoryOut:",
        print self.sensorRegion.getOutputData("categoryOut")

        if self.verbosity >= 3:
            if self.tpRegion is not None:
                print "TP region input:",
                print self.tpRegion.getInputData("activeCells").nonzero()
                print "TP region output:",
                print self.tpRegion.getOutputData("mostActiveCells").nonzero()

            print "Classifier bottomUpIn: ",
            print self.classifierRegion.getInputData("bottomUpIn").nonzero()
            print "Classifier categoryIn: ",
            print self.classifierRegion.getInputData("categoryIn")[0 : self.numLabels]

        print "Classifier categoriesOut: ",
        print self.classifierRegion.getOutputData("categoriesOut")[0 : self.numLabels]
        print "Classifier categoryProbabilitiesOut",
        print self.classifierRegion.getOutputData("categoryProbabilitiesOut")[0 : self.numLabels]

    def dumpProfile(self):
        """
    Print region profiling information in a nice format.
    """
        print "Profiling information for {}".format(type(self).__name__)
        totalTime = 0.000001
        for region in self.network.regions.values():
            timer = region.computeTimer
            totalTime += timer.getElapsed()

        profileInfo = []
        for region in self.network.regions.values():
            timer = region.computeTimer
            profileInfo.append(
                [region.name, timer.getStartCount(), timer.getElapsed(), 100.0 * timer.getElapsed() / totalTime]
            )

        profileInfo.append(["Total time", "", totalTime, "100.0"])
        print tabulate(profileInfo, headers=["Region", "Count", "Elapsed", "Percent of total"], tablefmt="grid")

    def __getstate__(self):
        """
    Return serializable state.  This function will return a version of the
    __dict__ with data that shouldn't be pickled stripped out. For example,
    Network API instances are stripped out because they have their own
    serialization mechanism.

    See also: _serializeExtraData()
    """
        state = self.__dict__.copy()
        # Remove member variables that we can't pickle
        state.pop("network")
        state.pop("sensorRegion")
        state.pop("classifierRegion")
        state.pop("tpRegion")
        state.pop("learningRegions")
        state.pop("networkDataGen")

        return state

    def _serializeExtraData(self, extraDataDir):
        """
    Protected method that is called during serialization with an external
    directory path. We override it here to save the Network API instance.

    @param extraDataDir (string) Model's extra data directory path
    """
        self.network.save(os.path.join(extraDataDir, "network.nta"))

    def _deSerializeExtraData(self, extraDataDir):
        """
    Protected method that is called during deserialization (after __setstate__)
    with an external directory path. We override it here to load the Network API
    instance.

    @param extraDataDir (string) Model's extra data directory path
    """
        self.network = Network(os.path.join(extraDataDir, "network.nta"))
        self._initializeRegionHelpers()
        self.networkDataGen = NetworkDataGenerator()
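
Taken together, the trainText() and classifyText() methods above support token-at-a-time streaming: each word is queued individually, and a reset is issued after the last token of a document. A usage sketch, assuming a trained model instance:

    # Hypothetical usage; `model` is a trained ClassificationModelHTM instance.
    tokens = ["stainless", "steel", "pan"]
    for i, token in enumerate(tokens):
        reset = 1 if i == len(tokens) - 1 else 0   # reset at the sequence end
        likelihoods = model.classifyText(token, reset=reset)
    print likelihoods  # numpy array of size numLabels; all zeros means no decision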
Example #28
  def testSplitNoPreprocess(self):
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")

    ndg.split(filename, 3, False)
    self.assertRecordsEqual(ndg.records, self.expected)
Example #29
  animals, vegetables = getAnimalVegetableList()
  vegetable = {}
  animal = {}
  tmCellUnion = []
  tmInputUnion = []
  tpOutput = []
  categoryLabel = []
  accuracy = []
  accuracyTp = []
  knnInLastNSequences = 20
  knnNumber = 1

  plt.close('all')
  plt.figure(1)
  plt.show()
  numTokens = NetworkDataGenerator.getNumberOfTokens(args.dataPath)
  for numSample in xrange(len(numTokens)):
    # union SDR for this sequence
    tmCellActivation = np.zeros((tmRegion._tfdr.cellsPerColumn * tmRegion._tfdr.columnDimensions[0],))
    tmInputActivation = np.zeros((tmRegion._tfdr.columnDimensions[0],))
    print
    for word in xrange(numTokens[numSample]):
      sensorInput = None
      sensorOutput = {'categoryOut': np.array([0]),
                      'resetOut': [None],
                      'sourceOut': None,
                      'sequenceIdOut': [None],
                      'encodingOut': None,
                      'dataOut': np.zeros((sensorRegion.encoder.n, ))}
      sensorRegion.compute(sensorInput, sensorOutput)
Example #30
class ClassificationModelHTM(ClassificationModel):
  """Class to run the classification experiments with HTM network models."""

  def __init__(self,
               networkConfig,
               inputFilePath,
               retinaScaling=1.0,
               retina="en_associative",
               apiKey=None,
               verbosity=1,
               numLabels=3,
               modelDir="ClassificationModelHTM",
               prepData=True,
               stripCats=False):
    """
    @param networkConfig      (dict)    Network configuration dict with region
                                        parameters.
    @param inputFilePath      (str)     Path to data file.
    @param retinaScaling      (float)   Scales the dimensions of the SDRs.
    @param retina             (str)     Name of Cio retina.
    @param apiKey             (str)     Key for Cio API.
    @param prepData           (bool)    Prepare the input data into network API
                                        format.
    @param stripCats          (bool)    Remove the categories and replace them
                                        with the sequence_Id.
    See ClassificationModel for remaining parameters.
    """
    super(ClassificationModelHTM, self).__init__(
      verbosity=verbosity, numLabels=numLabels, modelDir=modelDir)

    self.networkConfig = networkConfig
    self.retinaScaling = retinaScaling
    self.retina = retina
    self.apiKey = apiKey

    self.networkDataGen = NetworkDataGenerator()
    if prepData:
      self.networkDataPath = self.prepData(inputFilePath, stripCats=stripCats)
    else:
      self.networkDataPath = inputFilePath

    self.network = self.initModel()
    self.learningRegions = self._getLearningRegions()

    # Always a sensor and classifier region.
    self.sensorRegion = self.network.regions[
      self.networkConfig["sensorRegionConfig"].get("regionName")]
    self.classifierRegion = self.network.regions[
      self.networkConfig["classifierRegionConfig"].get("regionName")]


  def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
    """
    Generate the data in network API format.

    @param dataPath          (str)  Path to input data file; format as expected
                                    by NetworkDataGenerator.
    @param ordered           (bool) Keep order of data, or randomize.
    @param stripCats         (bool) Remove the categories and replace them with
                                    the sequence_Id.
    @return networkDataPath  (str)  Path to data formatted for network API.
    """
    networkDataPath = self.networkDataGen.setupData(
      dataPath, self.numLabels, ordered, stripCats, **kwargs)

    return networkDataPath


  def initModel(self):
    """
    Initialize the network; self.networkDataPath must already be set.
    """
    recordStream = FileRecordStream(streamID=self.networkDataPath)
    root = os.path.dirname(os.path.realpath(__file__))
    encoder = CioEncoder(retinaScaling=self.retinaScaling,
                         cacheDir=os.path.join(root, "CioCache"),
                         retina=self.retina,
                         apiKey=self.apiKey)

    # This encoder specifies the LanguageSensor output width.
    return configureNetwork(recordStream, self.networkConfig, encoder)


  def _getLearningRegions(self):
    """Return tuple of the network's region objects that learn."""
    learningRegions = []
    for region in self.network.regions.values():
      spec = region.getSpec()
      if spec.parameters.contains('learningMode'):
        learningRegions.append(region)

    return learningRegions


  # TODO: is this still needed?
  def encodeSample(self, sample):
    """
    Put each token in its own dictionary with its bitmap
    @param sample     (list)            Tokenized sample, where each item is a
                                        string token.
    @return           (list)            The sample text, sparsity, and bitmap
                                        for each token. Since the network will
                                        do the actual encoding, the bitmap and
                                        sparsity will be None
    Example return list:
      [{
        "text": "Example text",
        "sparsity": 0.0,
        "bitmap": None
      }]
    """
    return [{"text": t,
             "sparsity": None,
             "bitmap": None} for t in sample]


  def resetModel(self):
    """
    Reset the model by creating a new network since the network API does not
    support resets.
    """
    # TODO: test this works as expected
    self.network = self.initModel()


  def saveModel(self, trial=None):
    try:
      if not os.path.exists(self.modelDir):
        os.makedirs(self.modelDir)
      if trial:
        netPath = os.path.join(self.modelDir, "network_{}.nta".format(trial))
      else:
        netPath = os.path.join(self.modelDir, "network.nta")
      self.network.save(netPath)
      # with open(netPath, "wb") as f:
      #   pkl.dump(self, f)
      if self.verbosity > 0:
        print "Model saved to '{}'.".format(netPath)
    except IOError as e:
      print "Could not save model to '{}'.".format(netPath)
      raise e


  def trainModel(self, iterations=1):
    """
    Run the network with all regions learning.
    Note self.sampleReference doesn't get populated because in a network model
    there's a 1-to-1 mapping of training samples.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", True)

    self.network.run(iterations)


  def trainNetwork(self, iterations):
    """Run the network with all regions learning but the classifier."""
    for region in self.learningRegions:
      if region.name == "classifier":
        region.setParameter("learningMode", False)
      else:
        region.setParameter("learningMode", True)

    self.network.run(iterations)


  def classifyNetwork(self, iterations):
    """
    For running after the network has been trained by trainNetwork(), this
    populates the KNN prototype space with the final network representations.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", False)

    sensor = self.sensorRegion.getSelf()
    sensor.rewind()

    self.classifierRegion.setParameter("learningMode", True)
    self.classifierRegion.setParameter("inferenceMode", True)

    sequenceIds = []
    for _ in xrange(iterations):
      self.network.run(1)
      sequenceIds.append(sensor.getOutputValues("sequenceIdOut")[0])

    return sequenceIds


  def inferNetwork(self, iterations, fileRecord=None, learn=False):
    """
    Run the network to infer distances to the classified samples.

    @param fileRecord (str)     If you want to change the file record stream.
    @param learn      (bool)    The classifier will learn the inferred sequence.
    """
    if fileRecord:
      self.swapRecordStream(fileRecord)

    self.classifierRegion.setParameter("learningMode", learn)
    self.classifierRegion.setParameter("inferenceMode", True)

    sampleDistances = None
    for i in xrange(iterations):
      self.network.run(1)
      inferenceValues = self.classifierRegion.getOutputData("categoriesOut")
      # Sum together the inferred distances for each word of the sequence.
      if sampleDistances is None:
        sampleDistances = inferenceValues
      else:
        sampleDistances += inferenceValues

    return sampleDistances


  def swapRecordStream(self, dataPath):
    """Change the data source for the network's sensor region."""
    recordStream = FileRecordStream(streamID=dataPath)
    sensor = self.sensorRegion.getSelf()
    sensor.dataSource = recordStream  # TODO: implement this in network API


  def testModel(self, seed=42):
    """
    Test the classifier region on the input sample. Call this method for each
    word of a sequence. The random seed is used in getWinningLabels().

    @return           (numpy array)   numLabels most-frequent classifications
                                      for the data samples; int or empty.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", False)
    self.classifierRegion.setParameter("inferenceMode", True)

    self.network.run(1)

    inference = self._getClassifierInference(seed)
    activityBitmap = self.classifierRegion.getInputData("bottomUpIn")

    return inference, activityBitmap


  def _getClassifierInference(self, seed):
    """Return output categories from the classifier region."""
    relevantCats = self.classifierRegion.getParameter("categoryCount")

    if self.classifierRegion.type == "py.KNNClassifierRegion":
      # max number of inferences = k
      inferenceValues = self.classifierRegion.getOutputData(
        "categoriesOut")[:relevantCats]
      return self.getWinningLabels(inferenceValues, seed)

    elif self.classifierRegion.type == "py.CLAClassifierRegion":
      # TODO: test this
      return self.classifierRegion.getOutputData("categoriesOut")[:relevantCats]


  def queryModel(self, query, preprocess=False):
    """
    Run the query through the network, getting the classifier region's inferences
    for all words of the query sequence.
    @return       (list)          Two-tuples of sequence ID and distance, sorted
                                  closest to farthest from the query.
    """
    for region in self.learningRegions:
      region.setParameter("learningMode", False)
    self.classifierRegion.setParameter("inferenceMode", True)

    # Put query text in LanguageSensor data format.
    queryDicts = self.networkDataGen.generateSequence(query, preprocess)

    sensor = self.sensorRegion.getSelf()
    sampleDistances = None
    for qD in queryDicts:
      # Sum together the inferred distances for each word of the query sequence.
      sensor.queue.appendleft(qD)
      self.network.run(1)
      inferenceValues = self.classifierRegion.getOutputData("categoriesOut")
      if sampleDistances is None:
        sampleDistances = inferenceValues
      else:
        sampleDistances += inferenceValues

    catCount = self.classifierRegion.getParameter("categoryCount")
    # numpy.lexsort() sorts primarily by its last key (the distances) and
    # secondarily by randomValues, so ties in distance are broken randomly.
    randomValues = numpy.random.random(catCount)
    sortedSamples = numpy.lexsort((randomValues, sampleDistances[:catCount]))
    qTuple = [(a, b) for a, b in zip(sortedSamples, sampleDistances[:catCount])]

    return sorted(qTuple, key=operator.itemgetter(1))
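
The lexsort tie-breaking above is easiest to see on toy values. A minimal
standalone sketch (the distances below are synthetic, not model output):

import numpy

# Samples 1 and 2 tie at distance 0.2; the random key decides their order.
sampleDistances = numpy.array([0.5, 0.2, 0.2, 0.9])
randomValues = numpy.random.random(len(sampleDistances))

# lexsort() sorts by its last key first (distance), consulting randomValues
# only for entries with equal distance.
sortedSamples = numpy.lexsort((randomValues, sampleDistances))
print zip(sortedSamples, sampleDistances[sortedSamples])
# e.g. [(1, 0.2), (2, 0.2), (0, 0.5), (3, 0.9)], with 1 and 2 in random order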
Example #31
  def setupNetData(
    self, generateData=True, seed=42, preprocess=False, **kwargs):
    """
    Resulting network data files created:
      - One for each bucket
      - One for each training rep, where samples are not repeated in a given
        file. Each sample is given its own category (_category = _sequenceId).

    The classification json is saved when generating the final training file.
    """
    if generateData:
      ndg = NetworkDataGenerator()
      self.dataDict = ndg.split(
        filePath=self.dataPath, numLabels=1, textPreprocess=preprocess,
        **kwargs)

      filename, ext = os.path.splitext(self.dataPath)
      self.classificationFile = "{}_categories.json".format(filename)

      # Generate test data files: one network data file for each bucket.
      bucketFilePaths = bucketCSVs(self.dataPath)
      for bucketFile in bucketFilePaths:
        ndg.reset()
        ndg.split(
          filePath=bucketFile, numLabels=1, textPreprocess=preprocess, **kwargs)
        bucketFileName, ext = os.path.splitext(bucketFile)
        if not self.orderedSplit:
          # the sequences will be written to the file in random order
          ndg.randomizeData(seed)
        dataFile = "{}_network{}".format(bucketFileName, ext)
        # The classification file written here gets (correctly) overwritten
        # when the final training file is generated below.
        ndg.saveData(dataFile, self.classificationFile)
        self.bucketFiles.append(dataFile)

      # Generate training data file(s).
      self.trainingDicts = []
      uniqueDataDict = OrderedDict()
      included = []
      seqID = 0
      for dataEntry in self.dataDict.values():
        uniqueID = dataEntry[2]
        if uniqueID not in included:
          # skip over the samples that are repeated in multiple buckets
          uniqueDataDict[seqID] = dataEntry
          included.append(uniqueID)
          seqID += 1
      self.trainingDicts.append(uniqueDataDict)

      ndg.reset()
      ndg.split(
        dataDict=uniqueDataDict, numLabels=1, textPreprocess=preprocess,
        **kwargs)
      for rep in xrange(self.trainingReps):
        # use a different file for each training rep
        if not self.orderedSplit:
          ndg.randomizeData(seed)
        ndg.stripCategories()  # replace the categories w/ seqId
        dataFile = "{}_network_training_{}{}".format(filename, rep, ext)
        ndg.saveData(dataFile, self.classificationFile)
        self.dataFiles.append(dataFile)

      # TODO: maybe add a method (and arg) for removing all these data files

    else:
      # TODO (only if needed)
      raise NotImplementedError("Must generate data.")

    # Label references match the classification JSON.
    self.mapLabelRefs()
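
The training-file step above keeps only the first occurrence of each sample
across buckets, re-keyed by a fresh sequence ID. A standalone sketch of that
dedup loop with toy entries (the third tuple element stands in for the unique
ID, mirroring the dataEntry[2] convention above):

from collections import OrderedDict

dataDict = {
    0: (["hello"], [0], "idA"),
    1: (["world"], [1], "idB"),
    2: (["hello"], [0], "idA"),  # same sample repeated in a second bucket
}

uniqueDataDict = OrderedDict()
included = []
seqID = 0
for dataEntry in dataDict.values():
    uniqueID = dataEntry[2]
    if uniqueID not in included:
        uniqueDataDict[seqID] = dataEntry
        included.append(uniqueID)
        seqID += 1

print uniqueDataDict.keys()  # [0, 1]; the repeated "idA" entry is dropped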
Example #32
class ClassificationModelHTM(ClassificationModel):
    """Classify text using generic network-API based models."""
    def __init__(self,
                 networkConfig,
                 inputFilePath,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelHTM",
                 prepData=True,
                 stripCats=False):
        """
    @param networkConfig      (dict)    Network configuration dict with region
                                        parameters.
    @param inputFilePath      (str)     Path to data file.
    @param retinaScaling      (float)   Scales the dimensions of the SDRs.
    @param retina             (str)     Name of Cio retina.
    @param apiKey             (str)     Key for Cio API.
    @param prepData           (bool)    Prepare the input data into network API
                                        format.
    @param stripCats          (bool)    Remove the categories and replace them
                                        with the sequence ID.
    See ClassificationModel for remaining parameters.

    Note that classifierMetric is not specified here as it is in other models;
    it is set in the network config file.
    """
        super(ClassificationModelHTM, self).__init__(verbosity=verbosity,
                                                     numLabels=numLabels,
                                                     modelDir=modelDir)

        self.networkConfig = networkConfig
        self.retinaScaling = retinaScaling
        self.retina = retina
        self.apiKey = apiKey
        self.inputFilePath = inputFilePath

        self.networkDataGen = NetworkDataGenerator()
        if prepData:
            self.networkDataPath = self.prepData(self.inputFilePath,
                                                 stripCats=stripCats)
        else:
            self.networkDataPath = self.inputFilePath

        self.network = self.initModel()
        self._initializeRegionHelpers()

    def getClassifier(self):
        """
    Returns the classifier for the model.
    """
        return self.classifierRegion.getSelf().getAlgorithmInstance()

    def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
        """
    Generate the data in network API format.

    @param dataPath          (str)  Path to input data file; format as expected
                                    by NetworkDataGenerator.
    @param ordered           (bool) Keep order of data, or randomize.
    @param stripCats         (bool) Remove the categories and replace them with
                                    the sequence ID.
    @return networkDataPath  (str)  Path to data formatted for the network API.
    """
        networkDataPath = self.networkDataGen.setupData(
            dataPath, self.numLabels, ordered, stripCats, **kwargs)

        return networkDataPath

    def initModel(self):
        """
    Initialize the network; self.networkDataPath must already be set.
    """
        if self.networkDataPath is not None:
            recordStream = FileRecordStream(streamID=self.networkDataPath)
        else:
            recordStream = None

        root = os.path.dirname(os.path.realpath(__file__))
        encoder = CioEncoder(retinaScaling=self.retinaScaling,
                             cacheDir=os.path.join(root, "CioCache"),
                             retina=self.retina,
                             apiKey=self.apiKey)

        # This encoder specifies the LanguageSensor output width.
        return configureNetwork(recordStream, self.networkConfig, encoder)

    def _initializeRegionHelpers(self):
        """
    Set helper member variables once network has been initialized. This will
    also be called from _deSerializeExtraData()
    """
        learningRegions = []
        for region in self.network.regions.values():
            spec = region.getSpec()
            if spec.parameters.contains('learningMode'):
                learningRegions.append(region)

        # Always a sensor and classifier region.
        self.sensorRegion = self.network.regions[
            self.networkConfig["sensorRegionConfig"].get("regionName")]
        self.classifierRegion = self.network.regions[
            self.networkConfig["classifierRegionConfig"].get("regionName")]

        # There is sometimes a TP region
        self.tpRegion = None
        if "tpRegionConfig" in self.networkConfig:
            self.tpRegion = self.network.regions[
                self.networkConfig["tpRegionConfig"].get("regionName")]

        self.learningRegions = learningRegions

    # TODO: is this still needed?
    def encodeSample(self, sample):
        """
    Put each token in its own dictionary with its bitmap.

    @param sample     (list)            Tokenized sample, where each item is a
                                        string token.
    @return           (list)            The sample text, sparsity, and bitmap
                                        for each token. Since the network will
                                        do the actual encoding, the bitmap and
                                        sparsity will be None.
    Example return list:
      [{
        "text": "Example text",
        "sparsity": 0.0,
        "bitmap": None
      }]
    """
        return [{"text": t, "sparsity": None, "bitmap": None} for t in sample]

    def resetModel(self):
        """
    Reset the model by creating a new network since the network API does not
    support resets.
    """
        # TODO: test this works as expected
        self.network = self.initModel()

    def saveModel(self, trial=None):
        try:
            if not os.path.exists(self.modelDir):
                os.makedirs(self.modelDir)
            if trial:
                netPath = os.path.join(self.modelDir,
                                       "network_{}.nta".format(trial))
            else:
                netPath = os.path.join(self.modelDir, "network.nta")
            self.network.save(netPath)
            if self.verbosity > 0:
                print "Model saved to '{}'.".format(netPath)
        except IOError as e:
            print "Could not save model to '{}'.".format(netPath)
            raise e

    def trainModel(self, iterations=1):
        """
    Run the network with all regions learning.
    Note: self.sampleReference doesn't get populated because in a network model
    there is a one-to-one mapping of training samples.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", True)

        self.network.run(iterations)

    def trainNetwork(self, iterations):
        """Run the network with all regions learning but the classifier."""
        for region in self.learningRegions:
            if region.name == "classifier":
                region.setParameter("learningMode", False)
            else:
                region.setParameter("learningMode", True)

        self.network.run(iterations)

    def classifyNetwork(self, iterations):
        """
    For running after the network has been trained by trainNetwork(), this
    populates the KNN prototype space with the final network representations.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)

        sensor = self.sensorRegion.getSelf()
        sensor.rewind()

        self.classifierRegion.setParameter("learningMode", True)
        self.classifierRegion.setParameter("inferenceMode", True)

        sequenceIds = []
        for _ in xrange(iterations):
            self.network.run(1)
            sequenceIds.append(sensor.getOutputValues("sequenceIdOut")[0])

        return sequenceIds
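
    # A hedged pipeline sketch (the iteration counts, file path, and model
    # instance are hypothetical): train the non-classifier regions, populate
    # the KNN prototype space, then infer against a held-out file:
    #
    #   model.trainNetwork(iterations=100)
    #   seqIds = model.classifyNetwork(iterations=50)
    #   distances = model.inferNetwork(
    #       iterations=10, fileRecord="bucket_0_network.csv")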

    def inferNetwork(self, iterations, fileRecord=None, learn=False):
        """
    Run the network to infer distances to the classified samples.

    @param fileRecord (str)     If you want to change the file record stream.
    @param learn      (bool)    The classifier will learn the inferred sequnce.
    """
        if fileRecord:
            self.swapRecordStream(fileRecord)

        self.classifierRegion.setParameter("learningMode", learn)
        self.classifierRegion.setParameter("inferenceMode", True)

        sampleDistances = None
        for i in xrange(iterations):
            self.network.run(1)
            inferenceValues = self.classifierRegion.getOutputData(
                "categoriesOut")
            # Sum together the inferred distances for each word of the sequence.
            if sampleDistances is None:
                sampleDistances = inferenceValues
            else:
                sampleDistances += inferenceValues

        return sampleDistances

    def swapRecordStream(self, dataPath):
        """Change the data source for the network's sensor region."""
        recordStream = FileRecordStream(streamID=dataPath)
        sensor = self.sensorRegion.getSelf()
        sensor.dataSource = recordStream  # TODO: implement this in network API

    def testModel(self, seed=42):
        """
    Test the classifier region on the input sample. Call this method for each
    word of a sequence. The random seed is used in getWinningLabels().

    @return           (tuple)         Two-tuple of the inference (numLabels
                                      most-frequent classifications; ints or
                                      empty) and the classifier's input
                                      activity bitmap.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
        self.classifierRegion.setParameter("inferenceMode", True)

        self.network.run(1)

        inference = self._getClassifierInference(seed)
        activityBitmap = self.classifierRegion.getInputData("bottomUpIn")

        return inference, activityBitmap

    def _getClassifierInference(self, seed):
        """Return output categories from the classifier region."""
        relevantCats = self.classifierRegion.getParameter("categoryCount")

        if self.classifierRegion.type == "py.KNNClassifierRegion":
            # max number of inferences = k
            inferenceValues = self.classifierRegion.getOutputData(
                "categoriesOut")[:relevantCats]
            return self.getWinningLabels(inferenceValues, seed)

        elif self.classifierRegion.type == "py.CLAClassifierRegion":
            # TODO: test this
            return self.classifierRegion.getOutputData(
                "categoriesOut")[:relevantCats]

    def queryModel(self, query, preprocess=False):
        """
    Run the query through the network, getting the classifier region's inferences
    for all words of the query sequence.
    @return       (list)          Two-tuples of sequence ID and distance, sorted
                                  closest to farthest from the query.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
        self.classifierRegion.setParameter("inferenceMode", True)

        # Put query text in LanguageSensor data format.
        queryDicts = self.networkDataGen.generateSequence(query, preprocess)

        sensor = self.sensorRegion.getSelf()
        sampleDistances = None
        for qD in queryDicts:
            # Sum together the inferred distances for each word of the query sequence.
            sensor.queue.appendleft(qD)
            self.network.run(1)
            inferenceValues = self.classifierRegion.getOutputData(
                "categoriesOut")
            if sampleDistances is None:
                sampleDistances = inferenceValues
            else:
                sampleDistances += inferenceValues

        catCount = self.classifierRegion.getParameter("categoryCount")
        # numpy.lexsort() sorts primarily by its last key (the distances) and
        # secondarily by randomValues, so ties in distance are broken randomly.
        randomValues = numpy.random.random(catCount)
        sortedSamples = numpy.lexsort(
            (randomValues, sampleDistances[:catCount]))
        qTuple = [(a, b)
                  for a, b in zip(sortedSamples, sampleDistances[:catCount])]

        return sorted(qTuple, key=operator.itemgetter(1))

    def reset(self):
        """
    Issue a reset signal to the model. The assumption is that a sequence has
    just ended and a new sequence is about to begin. The default behavior is to
    do nothing; not all subclasses re-implement this.
    """
        # TODO: Introduce a consistent reset method name.
        for r in self.learningRegions:
            if r.type == 'py.TemporalPoolerRegion':
                r.executeCommand(['reset'])
            elif r.type == 'py.TPRegion':
                r.executeCommand(['resetSequenceStates'])

    def trainText(self, token, labels, sequenceId=None, reset=0):
        """
    Train the model with the given text token, associated labels, and
    sequence ID.

    @param token      (str)  The text token to train on
    @param labels     (list) A list of one or more integer labels associated
                             with this token. If the list is empty, the
                             classifier will not be trained.
    @param sequenceId (int)  An integer ID associated with this token and its
                             sequence (document).
    @param reset      (int)  Should be 0 or 1. If 1, assumes we are at the
                             beginning of a new sequence.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", True)
        sensor = self.sensorRegion.getSelf()
        sensor.addDataToQueue(token, labels, sequenceId, 0)
        self.network.run(1)

        # Print the outputs of each region
        if self.verbosity >= 2:
            self.printRegionOutputs()

        if reset == 1:
            self.reset()

    def classifyText(self, token, reset=0):
        """
    Classify the token and return a list of the best classifications.

    @param token    (str)  The text token to classify
    @param reset    (int)  Should be 0 or 1. If 1, assumes we are at the
                           end of a sequence. A reset signal will be issued
                           after the model has classified this token.

    @return  (numpy array) An array of size numLabels. Position i contains
                           the likelihood that this sample belongs to the
                           i'th category. An array containing all zeros
                           implies no decision could be made.
    """
        for region in self.learningRegions:
            region.setParameter("learningMode", False)
            region.setParameter("inferenceMode", True)
        sensor = self.sensorRegion.getSelf()
        sensor.addDataToQueue(token, [None], -1, 0)
        self.network.run(1)

        # Print the outputs of each region
        if self.verbosity >= 2:
            self.printRegionOutputs()

        if reset == 1:
            self.reset()

        return self.classifierRegion.getOutputData(
            "categoriesOut")[0:self.numLabels]

    def printRegionOutputs(self):
        """
    Print the outputs of regions to console for debugging, depending on
    verbosity level.
    """

        print "================== HTM Debugging output:"
        print "Sensor output:",
        print self.sensorRegion.getOutputData("dataOut").nonzero()
        print "Sensor categoryOut:",
        print self.sensorRegion.getOutputData("categoryOut")

        if self.verbosity >= 3:
            if self.tpRegion is not None:
                print "TP region input:",
                print self.tpRegion.getInputData("activeCells").nonzero()
                print "TP region output:",
                print self.tpRegion.getOutputData("mostActiveCells").nonzero()

            print "Classifier bottomUpIn: ",
            print self.classifierRegion.getInputData("bottomUpIn").nonzero()
            print "Classifier categoryIn: ",
            print self.classifierRegion.getInputData(
                "categoryIn")[0:self.numLabels]

        print "Classifier categoriesOut: ",
        print self.classifierRegion.getOutputData(
            "categoriesOut")[0:self.numLabels]
        print "Classifier categoryProbabilitiesOut",
        print self.classifierRegion.getOutputData(
            "categoryProbabilitiesOut")[0:self.numLabels]

    def __getstate__(self):
        """
    Return serializable state.  This function will return a version of the
    __dict__ with data that shouldn't be pickled stripped out. For example,
    Network API instances are stripped out because they have their own
    serialization mechanism.

    See also: _serializeExtraData()
    """
        state = self.__dict__.copy()
        # Remove member variables that we can't pickle
        state.pop("network")
        state.pop("sensorRegion")
        state.pop("classifierRegion")
        state.pop("tpRegion")
        state.pop("learningRegions")
        state.pop("networkDataGen")

        return state

    def _serializeExtraData(self, extraDataDir):
        """
    Protected method that is called during serialization with an external
    directory path. We override it here to save the Network API instance.

    @param extraDataDir (string) Model's extra data directory path
    """
        self.network.save(os.path.join(extraDataDir, "network.nta"))

    def _deSerializeExtraData(self, extraDataDir):
        """
    Protected method that is called during deserialization (after __setstate__)
    with an external directory path. We override it here to load the Network API
    instance.

    @param extraDataDir (string) Model's extra data directory path
    """
        self.network = Network(os.path.join(extraDataDir, "network.nta"))
        self._initializeRegionHelpers()
        self.networkDataGen = NetworkDataGenerator()
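
Taken together, a typical lifecycle for this model might look like the
following sketch. The config path, data path, and iteration count are
placeholders; the network config is assumed to be a JSON dict of region
parameters, as the constructor expects:

import json

with open("network_config.json") as f:
    networkConfig = json.load(f)

model = ClassificationModelHTM(
    networkConfig=networkConfig,
    inputFilePath="test_data/multi_sample.csv",
    numLabels=3,
    verbosity=0)

model.trainModel(iterations=100)  # all regions learning
results = model.queryModel("sample query text")
print results[:3]  # three closest (sequence ID, distance) pairs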