Example #1
    def train(self):
        """
    Train the network regions on the entire dataset.
    There should be one datafile for each training rep in self.dataFiles, where
    every data sample (i.e. sequence) appears only once in each file.
    """
        # TODO: ignore patterns < minSparsity (= 0.9 * unionSparsity)
        if self.trainingReps != len(self.dataFiles):
            raise RuntimeError(
                "Mismatch between the number of specified training "
                "reps and the number of data files (should be 1:1).")

        for dataFile in self.dataFiles:
            if self.verbosity > 0:
                print "Running all the data through the network for training..."
            self.model.swapRecordStream(dataFile)
            numTokens = NetworkDataGenerator().getNumberOfTokens(dataFile)
            n = sum(numTokens)
            self.model.trainNetwork(n)

        # Populate the classifier space by running through the last data file
        # again (n still holds its token count); learning in the other regions
        # is turned off by the model.
        if self.verbosity > 1:
            print "Populating the classifier with all of the sequences."
        self.classifiedSeqIds = self.model.classifyNetwork(n)
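
Usage note: the guard at the top of train() enforces a strict 1:1 pairing between training reps and network data files. A minimal sketch of that contract, with a hypothetical runner instance and hypothetical file names:

    runner.trainingReps = 2
    runner.dataFiles = ["data_network_training_0.csv",
                        "data_network_training_1.csv"]
    runner.train()  # raises RuntimeError if the two counts diverge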
Example #2
    def testSplitPreprocess(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")

        expected = [
            [{
                "_token": "gohbkchoo",
                "_category": "0 1",
                "_sequenceId": 0,
                "ID": "1",
                "_reset": 1
            }],
            [{
                "_token": "o",
                "_category": "2",
                "_sequenceId": 1,
                "ID": "2",
                "_reset": 1
            }, {
                "_token": "ca",
                "_category": "2",
                "_sequenceId": 1,
                "ID": "2",
                "_reset": 0
            }],
        ]

        ndg.split(filename, 3, True, ignoreCommon=100, correctSpell=True)
        self.assertRecordsEqual(ndg.records, expected)
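
The positional arguments in ndg.split(filename, 3, True, ...) line up with the filePath, numLabels, and textPreprocess keywords used in Example #3; the remaining kwargs are preprocessing options. A hedged keyword-style equivalent of the call above:

    ndg = NetworkDataGenerator()
    # Same call spelled with keywords; ignoreCommon and correctSpell are
    # forwarded to the text preprocessing step.
    ndg.split(filePath=filename, numLabels=3, textPreprocess=True,
              ignoreCommon=100, correctSpell=True)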
Example #3
    def generateNetworkDataFiles(self, splits, seed, preprocess, **kwargs):
        # TODO: use model.prepData()?
        ndg = NetworkDataGenerator()
        self.dataDict = ndg.split(filePath=self.dataPath,
                                  numLabels=self.numClasses,
                                  textPreprocess=preprocess,
                                  **kwargs)

        filename, ext = os.path.splitext(self.dataPath)
        self.classificationFile = "{}_categories.json".format(filename)

        # Generate one data file for each experiment iteration.
        if self.experimentType == "k-folds" and not self.orderedSplit:
            # only randomize the data order once for k-folds cross validation
            ndg.randomizeData(seed)
        for i in xrange(splits):
            if self.experimentType != "k-folds" and not self.orderedSplit:
                ndg.randomizeData(seed)
                seed += 1
            # e.g., ext is ".csv", so dataFile becomes "<filename>_network_<i>.csv"
            dataFile = "{}_network_{}{}".format(filename, i, ext)
            ndg.saveData(dataFile, self.classificationFile)
            self.dataFiles.append(dataFile)

        if self.verbosity > 0:
            print "{} file(s) generated at {}".format(len(self.dataFiles),
                                                      self.dataFiles)
            print "Classification JSON is at: {}".format(
                self.classificationFile)
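
To illustrate the naming scheme, a small sketch with a hypothetical dataPath (so ext is ".csv"):

    filename, ext = os.path.splitext("foo.csv")
    print ["{}_network_{}{}".format(filename, i, ext) for i in xrange(2)]
    # ['foo_network_0.csv', 'foo_network_1.csv']; the category mapping goes
    # to foo_categories.json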
Example #4
    def testSaveDataIncorrectType(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.csv")
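        # Note the deliberately wrong .csv extension: saveData() presumably
        # rejects a non-JSON categories file, raising the TypeError asserted
        # below.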
        ndg.split(filename, 3, False)

        with self.assertRaises(TypeError):
            ndg.saveData(dataOutputFile, categoriesOutputFile)
Example #5
    def _deSerializeExtraData(self, extraDataDir):
        """
    Protected method that is called during deserialization (after __setstate__)
    with an external directory path. We override it here to load the Network API
    instance.

    @param extraDataDir (string) Model's extra data directory path
    """
        self.network = Network(os.path.join(extraDataDir, "network.nta"))
        self._initializeRegionHelpers()
        self.networkDataGen = NetworkDataGenerator()
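
For symmetry, the serialization side presumably writes the network to the same "network.nta" path that this hook reloads. A hedged sketch, assuming a _serializeExtraData hook mirroring the method above:

    def _serializeExtraData(self, extraDataDir):
        # Hypothetical counterpart: persist the Network API instance to the
        # same file that _deSerializeExtraData() loads back.
        self.network.save(os.path.join(extraDataDir, "network.nta"))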
Example #6
    def testFileRecordStreamReadData(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        # If no error is raised, then the data is in the correct format
        frs = FileRecordStream(dataOutputFile)
Example #7
    def testSaveData(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        success = ndg.saveData(dataOutputFile, categoriesOutputFile)
        self.assertTrue(success)

        dataTable = pandas.read_csv(dataOutputFile).fillna("")

        types = {
            "_category": "list",
            "_token": "string",
            "_sequenceId": "int",
            "_reset": "int",
            "ID": "string"
        }
        specials = {
            "_category": "C",
            "_token": "",
            "_sequenceId": "S",
            "_reset": "R",
            "ID": ""
        }

        expected_records = [
            record for data in self.expected for record in data
        ]
        expected_records.insert(0, specials)
        expected_records.insert(0, types)

        for idx, values in dataTable.iterrows():
            record = values.to_dict()
            if idx > 1:
                # csv values are strings, so cast the ints
                record["_sequenceId"] = int(record["_sequenceId"])
                record["_reset"] = int(record["_reset"])
            self.assertDictEqual(record, expected_records[idx])

        with open(categoriesOutputFile) as f:
            categories = json.load(f)

        expected_categories = {
            "kitchen": 0,
            "environment": 1,
            "not helpful": 2
        }
        self.assertDictEqual(categories, expected_categories)
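
The two dicts inserted at index 0 mirror the three-line FileRecordStream header (field names, field types, special-flag row) that saveData() writes ahead of the data rows. A hedged sketch of the top of the saved CSV, with the column order assumed:

    _token,_category,_sequenceId,_reset,ID
    string,list,int,int,string
    ,C,S,R,
    gohbkchoo,0 1,0,1,1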
Example #8
    def __init__(self,
                 networkConfig,
                 inputFilePath,
                 retinaScaling=1.0,
                 retina="en_associative",
                 apiKey=None,
                 verbosity=1,
                 numLabels=3,
                 modelDir="ClassificationModelHTM",
                 prepData=True,
                 stripCats=False):
        """
    @param networkConfig      (dict)    Network configuration dict with region
                                        parameters.
    @param inputFilePath      (str)     Path to data file.
    @param retinaScaling      (float)   Scales the dimensions of the SDRs.
    @param retina             (str)     Name of Cio retina.
    @param apiKey             (str)     Key for Cio API.
    @param prepData           (bool)    Prepare the input data into network API
                                        format.
    @param stripCats          (bool)    Remove the categories and replace them
                                        with the sequence_Id.
    See ClassificationModel for remaining parameters.

    Note classifierMetric is not specified here as it is in other models. This
    is done in the network config file.
    """
        super(ClassificationModelHTM, self).__init__(verbosity=verbosity,
                                                     numLabels=numLabels,
                                                     modelDir=modelDir)

        self.networkConfig = networkConfig
        self.retinaScaling = retinaScaling
        self.retina = retina
        self.apiKey = apiKey
        self.inputFilePath = inputFilePath

        self.networkDataGen = NetworkDataGenerator()
        if prepData:
            self.networkDataPath = self.prepData(self.inputFilePath,
                                                 stripCats=stripCats)
        else:
            self.networkDataPath = self.inputFilePath

        self.network = self.initModel()
        self._initializeRegionHelpers()
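
A hedged construction sketch; the config and data file names here are hypothetical:

    import json

    with open("network_config.json") as f:  # hypothetical config file
        networkConfig = json.load(f)

    model = ClassificationModelHTM(networkConfig,
                                   "data/sample_reviews.csv",  # hypothetical
                                   apiKey=None,
                                   numLabels=3)
    # prepData defaults to True, so the input CSV is converted to network API
    # format before the network itself is built.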
Example #9
  def partitionIndices(self, seed=42, numInference=10):
    """
    Sets self.partitions for the buckets' querying and ranking sets. The
    corresponding numbers of tokens for each sequence are stored in
    self.numTokens.

    The order of sequences is already specified by the network data files; if
    generated by the experiment, these are in order or randomized as specified
    by the orderedSplit arg.
    """
    super(BucketHTMRunner, self).partitionIndices(
      seed=seed, numInference=numInference)

    # Get the number of tokens in each bucket file so the network knows how
    # many iterations to run. Note the order of buckets in self.bucketFiles is
    # not necessarily the same as the order of the partitions set above.
    ndg = NetworkDataGenerator()
    for dataFile in self.bucketFiles:
      self.numTokens.append(ndg.getNumberOfTokens(dataFile))
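
As Example #1 suggests (it sums the result), getNumberOfTokens() appears to return a per-sequence list of token counts, so self.numTokens ends up holding one such list per bucket file. A small hedged sketch with a hypothetical file name:

    ndg = NetworkDataGenerator()
    counts = ndg.getNumberOfTokens("bucket_0_network.csv")
    iterations = sum(counts)  # one network iteration per token, as in train()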
Example #10
    def testRandomize(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)

        random.seed(1)
        ndg.randomizeData()

        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        success = ndg.saveData(dataOutputFile, categoriesOutputFile)

        randomizedIDs = []
        dataTable = pandas.read_csv(dataOutputFile)
        for _, values in dataTable.iterrows():
            record = values.to_dict()
            idx = record["_sequenceId"]
            if idx.isdigit() and (not randomizedIDs
                                  or randomizedIDs[-1] != idx):
                randomizedIDs.append(idx)

        # The IDs were read from the CSV as strings, so compare against string
        # indices; comparing to a list of ints would trivially pass.
        self.assertNotEqual(randomizedIDs,
                            [str(i) for i in xrange(len(randomizedIDs))])
Example #11
  def setupNetData(
    self, generateData=True, seed=42, preprocess=False, **kwargs):
    """
    Resulting network data files created:
      - One for each bucket
      - One for each training rep, where samples are not repeated in a given
      file. Each sample is given its own category (_category = _sequenceId).

    The classification json is saved when generating the final training file.
    """
    if generateData:
      ndg = NetworkDataGenerator()
      self.dataDict = ndg.split(
        filePath=self.dataPath, numLabels=1, textPreprocess=preprocess,
        **kwargs)

      filename, ext = os.path.splitext(self.dataPath)
      self.classificationFile = "{}_categories.json".format(filename)

      # Generate test data files: one network data file for each bucket.
      bucketFilePaths = bucketCSVs(self.dataPath)
      for bucketFile in bucketFilePaths:
        ndg.reset()
        ndg.split(
          filePath=bucketFile, numLabels=1, textPreprocess=preprocess, **kwargs)
        bucketFileName, ext = os.path.splitext(bucketFile)
        if not self.orderedSplit:
          # the sequences will be written to the file in random order
          ndg.randomizeData(seed)
        dataFile = "{}_network{}".format(bucketFileName, ext)
        # The classification file written here gets (correctly) overwritten
        # later, when the final training file is generated.
        ndg.saveData(dataFile, self.classificationFile)
        self.bucketFiles.append(dataFile)

      # Generate training data file(s).
      self.trainingDicts = []
      uniqueDataDict = OrderedDict()
      included = []
      seqID = 0
      for dataEntry in self.dataDict.values():
        uniqueID = dataEntry[2]
        if uniqueID not in included:
          # skip over the samples that are repeated in multiple buckets
          uniqueDataDict[seqID] = dataEntry
          included.append(uniqueID)
          seqID += 1
      self.trainingDicts.append(uniqueDataDict)

      ndg.reset()
      ndg.split(
        dataDict=uniqueDataDict, numLabels=1, textPreprocess=preprocess,
        **kwargs)
      for rep in xrange(self.trainingReps):
        # use a different file for each training rep
        if not self.orderedSplit:
          ndg.randomizeData(seed)
        ndg.stripCategories()  # replace the categories w/ seqId
        dataFile = "{}_network_training_{}{}".format(filename, rep, ext)
        ndg.saveData(dataFile, self.classificationFile)
        self.dataFiles.append(dataFile)

      # TODO: maybe add a method (and arg) for removing all these data files

    else:
      # TODO (only if needed)
      raise NotImplementedError("Must generate data.")

    # Make sure label references match the classification JSON.
    self.mapLabelRefs()
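
Putting the pieces together, the method leaves behind one network data file per bucket, one training file per rep, and a single classification JSON. A hypothetical layout for dataPath="survey.csv" with two buckets and trainingReps=2 (the bucket file names depend on bucketCSVs()):

    # survey_bucket_0_network.csv, survey_bucket_1_network.csv     <- test data
    # survey_network_training_0.csv, survey_network_training_1.csv <- training
    # survey_categories.json                                       <- labels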
Example #12
    def testSplitNoPreprocess(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")

        ndg.split(filename, 3, False)
        self.assertRecordsEqual(ndg.records, self.expected)