コード例 #1
0
  def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
    """
    Prepare the per-trial network data files and the label structures.

    If self.generateData is set, self.dataPath is split by a
    NetworkDataGenerator and saved once per entry in self.trainSize, with the
    data reshuffled before each save unless self.orderedSplit is set. Each
    saved path is appended to self.dataFiles. Otherwise self.dataFiles is set
    to self.dataPath repeated once per trial.

    See runner.py (setupData) and network_data_generator.py (split) for the
    meaning of the parameters; preprocess, sampleIdx and any extra keyword
    arguments are passed straight to NetworkDataGenerator.split().
    """
    if not self.generateData:
      # Reuse the input file for every trial, i.e. an ordered split.
      self.dataFiles = [self.dataPath] * len(self.trainSize)
    else:
      generator = NetworkDataGenerator()
      generator.split(
        self.dataPath, sampleIdx, self.numClasses, preprocess, **kwargs)

      basePath, extension = os.path.splitext(self.dataPath)
      self.classificationFile = "{}-classifications.json".format(basePath)

      # One output file per trial; every file shares the classification JSON.
      for trial, _ in enumerate(self.trainSize):
        if not self.orderedSplit:
          generator.randomizeData()
        trialFile = "{}-{}{}".format(basePath, trial, extension)
        generator.saveData(trialFile, self.classificationFile)
        self.dataFiles.append(trialFile)

      if self.verbosity > 0:
        summary = "{} file(s) generated at {}".format(
          len(self.dataFiles), self.dataFiles)
        print(summary)
        print("Classification json is at: {}".format(self.classificationFile))

    # Collect the labels for each trial before publishing them on self.
    labels = []
    for trial, size in enumerate(self.trainSize):
      labels.append(self._getClassifications(size, trial))
    self.actualLabels = labels

    self._mapLabelRefs()
コード例 #2
0
    def testRandomize(self):
        """
        Check that randomizeData() changes the order of the saved sequences.

        The sample reviews file is split, shuffled after seeding the `random`
        module, and saved. The `_sequenceID` column of the saved file is then
        read back in file order and must differ from the ascending sequence
        0..N-1.
        """
        ndg = NetworkDataGenerator()
        filename = (
            self.dirName +
            "/../../../data/sample_reviews_multi/sample_reviews_data_training.csv"
        )
        ndg.split(filename, 2, 3, False)

        # Seed before shuffling; presumably randomizeData() draws from the
        # `random` module, which makes the shuffle repeatable -- TODO confirm.
        random.seed(1)
        ndg.randomizeData()

        dataOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_split.csv")
        categoriesOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_categories.json"
        )
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        randomizedIDs = []
        dataTable = pandas.read_csv(dataOutputFile)
        for cell in dataTable["_sequenceID"]:
            text = str(cell)
            # Skip non-numeric cells (presumably extra header rows of the
            # network data format -- verify against saveData).
            if not text.isdigit():
                continue
            # Store integers: comparing digit *strings* with range() could
            # never be equal, which made the final assertion always pass.
            seqId = int(text)
            # Consecutive rows sharing an ID are recorded once.
            if not randomizedIDs or randomizedIDs[-1] != seqId:
                randomizedIDs.append(seqId)

        # list(...) keeps the comparison list-vs-list on Python 2 and 3.
        self.assertNotEqual(randomizedIDs, list(range(len(randomizedIDs))))
コード例 #3
0
    def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
        """
        Build the list of network API data files (one per trial) and labels.

        With self.generateData set, a NetworkDataGenerator splits
        self.dataPath and writes one file per entry of self.trainSize,
        shuffling before each write unless self.orderedSplit is set; the
        written paths are appended to self.dataFiles. Without it,
        self.dataFiles becomes self.dataPath repeated for every trial.

        Look at runner.py (setupData) and network_data_generator.py (split)
        for the parameters; they are forwarded to
        NetworkDataGenerator.split() together with any extra keyword
        arguments.
        """
        if not self.generateData:
            # Same input file for each trial, so sample order is preserved.
            self.dataFiles = [self.dataPath] * len(self.trainSize)
        else:
            splitter = NetworkDataGenerator()
            splitter.split(
                self.dataPath, sampleIdx, self.numClasses, preprocess,
                **kwargs)

            stem, suffix = os.path.splitext(self.dataPath)
            self.classificationFile = "{}-classifications.json".format(stem)

            for trialIdx, _ in enumerate(self.trainSize):
                if not self.orderedSplit:
                    splitter.randomizeData()
                outputPath = "{}-{}{}".format(stem, trialIdx, suffix)
                splitter.saveData(outputPath, self.classificationFile)
                self.dataFiles.append(outputPath)

            if self.verbosity > 0:
                generatedMsg = "{} file(s) generated at {}".format(
                    len(self.dataFiles), self.dataFiles)
                jsonMsg = "Classification json is at: {}".format(
                    self.classificationFile)
                print(generatedMsg)
                print(jsonMsg)

        # Gather every trial's labels first, then assign them in one step.
        perTrialLabels = []
        for trialIdx, size in enumerate(self.trainSize):
            perTrialLabels.append(self._getClassifications(size, trialIdx))
        self.actualLabels = perTrialLabels

        self._mapLabelRefs()
コード例 #4
0
  def testRandomize(self):
    """
    Check that randomizeData() changes the order of the saved sequences.

    The multi-sample test file is split, shuffled after seeding the `random`
    module, and saved. The `_sequenceID` column of the saved file is then read
    back in file order and must differ from the ascending sequence 0..N-1.
    """
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
    ndg.split(filename, 3, False)

    # Seed before shuffling; presumably randomizeData() draws from the
    # `random` module, which makes the shuffle repeatable -- TODO confirm.
    random.seed(1)
    ndg.randomizeData()

    dataOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    ndg.saveData(dataOutputFile, categoriesOutputFile)

    randomizedIDs = []
    dataTable = pandas.read_csv(dataOutputFile)
    for cell in dataTable["_sequenceID"]:
      text = str(cell)
      # Skip non-numeric cells (presumably extra header rows of the network
      # data format -- verify against saveData).
      if not text.isdigit():
        continue
      # Store integers: comparing digit *strings* with range() could never be
      # equal, which made the final assertion always pass.
      seqId = int(text)
      # Consecutive rows sharing an ID are recorded once.
      if not randomizedIDs or randomizedIDs[-1] != seqId:
        randomizedIDs.append(seqId)

    # list(...) keeps the comparison list-vs-list on Python 2 and 3.
    self.assertNotEqual(randomizedIDs, list(range(len(randomizedIDs))))
コード例 #5
0
ファイル: htm_runner.py プロジェクト: numenta/nupic.fluent
  def setupNetData(self, preprocess=False, generateData=False, **kwargs):
    """
    Prepare the per-trial network data files and, if classes are in use, the
    label structures.

    When generateData is true, self.dataPath is split by a
    NetworkDataGenerator and saved once per entry in self.trainSizes, with the
    data reshuffled before each save unless self.orderedSplit is set; each
    saved path is appended to self.dataFiles. Otherwise self.dataFiles is set
    to self.dataPath repeated once per trial.

    See runner.py (setupData) and network_data_generator.py (split) for the
    parameters; preprocess and any extra keyword arguments are passed straight
    to NetworkDataGenerator.split().
    """
    if not generateData:
      # Use the input file for each trial; maintains the order of samples.
      self.dataFiles = [self.dataPath] * len(self.trainSizes)
    else:
      # TODO: use model.prepData()?
      generator = NetworkDataGenerator()
      generator.split(self.dataPath, self.numClasses, preprocess, **kwargs)

      basePath, extension = os.path.splitext(self.dataPath)
      self.classificationFile = "{}_categories.json".format(basePath)

      # One output file per trial; every file shares the categories JSON.
      for trial, _ in enumerate(self.trainSizes):
        if not self.orderedSplit:
          generator.randomizeData()
        trialFile = "{}_network_{}{}".format(basePath, trial, extension)
        generator.saveData(trialFile, self.classificationFile)
        self.dataFiles.append(trialFile)

      if self.verbosity > 0:
        summary = "{} file(s) generated at {}".format(
          len(self.dataFiles), self.dataFiles)
        print(summary)
        print("Classification JSON is at: {}".format(self.classificationFile))

    if self.numClasses > 0:
      # Setup labels data objects: gather per-trial labels, then publish them.
      labels = []
      for trial, size in enumerate(self.trainSizes):
        labels.append(self._getClassifications(size, trial))
      self.actualLabels = labels
      self._mapLabelRefs()